python - UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)


I have taken the code from https://github.com/davidadamojr/textrank and I am facing a problem. I tried to solve it by putting utf-8 into "keyphrases = decode('utf-8').extractKeyphrases(text)", but that failed.

Here is the code:

""" paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/mihalcea.pdf   external dependencies: nltk, numpy, networkx  based on https://gist.github.com/voidfiles/1646117 """  import nltk import itertools operator import itemgetter import networkx nx import sys import os  #apply syntactic filters based on pos tags def filter_for_tags(tagged, tags=['nn', 'jj', 'nnp']):     return [item item in tagged if item[1] in tags]  def normalize(tagged):     return [(item[0].replace('.', ''), item[1]) item in tagged]  def unique_everseen(iterable, key=none):     "list unique elements, preserving order. remember elements ever seen."     # unique_everseen('aaaabbbccdaabbb') --> b c d     # unique_everseen('abbccad', str.lower) --> b c d     seen = set()     seen_add = seen.add     if key none:         element in itertools.ifilterfalse(seen.__contains__, iterable):             seen_add(element)             yield element     else:         element in iterable:             k = key(element)             if k not in seen:                 seen_add(k)                 yield element  def ldistance(firststring, secondstring):     "function find levenshtein distance between 2 words/sentences - gotten http://rosettacode.org/wiki/levenshtein_distance#python"     if len(firststring) > len(secondstring):         firststring, secondstring = secondstring, firststring     distances = range(len(firststring) + 1)     index2, char2 in enumerate(secondstring):         newdistances = [index2 + 1]         index1, char1 in enumerate(firststring):             if char1 == char2:                 newdistances.append(distances[index1])             else:                 newdistances.append(1 + min((distances[index1], distances[index1+1], newdistances[-1])))         distances = newdistances     return distances[-1]  def buildgraph(nodes):     "nodes - list of hashables represents nodes of graph"     gr = nx.graph() #initialize undirected graph     gr.add_nodes_from(nodes)     nodepairs = list(itertools.combinations(nodes, 2))      #add edges graph (weighted levenshtein distance)     pair in nodepairs:         firststring = pair[0]         secondstring = pair[1]         levdistance = ldistance(firststring, secondstring)         gr.add_edge(firststring, secondstring, weight=levdistance)      return gr  def extractkeyphrases(text):     #tokenize text using nltk     wordtokens = nltk.word_tokenize(text)      #assign pos tags words in text     tagged = nltk.pos_tag(wordtokens)     textlist = [x[0] x in tagged]      tagged = filter_for_tags(tagged)     tagged = normalize(tagged)      unique_word_set = unique_everseen([x[0] x in tagged])     word_set_list = list(unique_word_set)     #this used determine adjacent words in order construct keyphrases 2 words      graph = buildgraph(word_set_list)      #pagerank - initial value of 1.0, error tolerance of 0,0001,      calculated_page_rank = nx.pagerank(graph, weight='weight')      #most important words in ascending order of importance     keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=true)      #the number of keyphrases returned relative size of text (a third of number of vertices)     athird = len(word_set_list) / 3     keyphrases = keyphrases[0:athird+1]      #take keyphrases multiple words consideration done in paper - if 2 words adjacent in text , selected keywords, join them     #together     modifiedkeyphrases = set([])     dealtwith = set([]) #keeps track of individual keywords have been joined form keyphrase     = 0     j = 1     while j < len(textlist):         
firstword = textlist[i]         secondword = textlist[j]         if firstword in keyphrases , secondword in keyphrases:             keyphrase = firstword + ' ' + secondword             modifiedkeyphrases.add(keyphrase)             dealtwith.add(firstword)             dealtwith.add(secondword)         else:             if firstword in keyphrases , firstword not in dealtwith:                  modifiedkeyphrases.add(firstword)              #if last word in text, , keyword,             #it has no chance of being keyphrase @ point                 if j == len(textlist)-1 , secondword in keyphrases , secondword not in dealtwith:                 modifiedkeyphrases.add(secondword)          = + 1         j = j + 1      return modifiedkeyphrases  def extractsentences(text):     sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')     sentencetokens = sent_detector.tokenize(text.strip())     graph = buildgraph(sentencetokens)      calculated_page_rank = nx.pagerank(graph, weight='weight')      #most important sentences in ascending order of importance     sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=true)      #return 100 word summary     summary = ' '.join(sentences)     summarywords = summary.split()     summarywords = summarywords[0:101]     summary = ' '.join(summarywords)      return summary  def writefiles(summary, keyphrases, filename):     "outputs keyphrases , summaries appropriate files"     print "generating output " + 'keywords/' + filename     keyphrasefile = open('keywords/' + filename, 'w')     keyphrase in keyphrases:         keyphrasefile.write(keyphrase + '\n')     keyphrasefile.close()      print "generating output " + 'summaries/' + filename     summaryfile = open('summaries/' + filename, 'w')     summaryfile.write(summary)     summaryfile.close()      print "-"   #retrieve each of articles articles = os.listdir("articles") article in articles:     print 'reading articles/' + article     articlefile = open('articles/' + article, 'r')     text = articlefile.read()     keyphrases = decode('utf-8').extractkeyphrases(text)     summary = extractsentences(text)     writefiles(summary, keyphrases, article) 

The error:

Reading articles/1.txt
Traceback (most recent call last):
  File "c:\users\dell\desktop\python\s\fyp\relevancy\test\textrank-master\textrank.py", line 166, in <module>
    keyphrases = extractKeyphrases(text).setdefaultencoding("utf-8")
  File "c:\users\dell\desktop\python\s\fyp\relevancy\test\textrank-master\textrank.py", line 72, in extractKeyphrases
    wordTokens = nltk.word_tokenize(text)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 93, in word_tokenize
    return [token for sent in sent_tokenize(text)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 82, in sent_tokenize
    return tokenizer.tokenize(text)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1270, in tokenize
    return list(self.sentences_from_text(text, realign_boundaries))
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1318, in sentences_from_text
    return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1309, in span_tokenize
    return [(sl.start, sl.stop) for sl in slices]
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1348, in _realign_boundaries
    for sl1, sl2 in _pair_iter(slices):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
    prev = next(it)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1324, in _slices_from_text
    if self.text_contains_sentbreak(context):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1369, in text_contains_sentbreak
    for t in self._annotate_tokens(self._tokenize_words(text)):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1504, in _annotate_second_pass
    for t1, t2 in _pair_iter(tokens):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
    prev = next(it)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 621, in _annotate_first_pass
    for aug_tok in tokens:
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 586, in _tokenize_words
    for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)

Any idea? (Sorry for my bad English.)

I think what you are looking for is:

# ...
text = articleFile.read().decode('utf-8')
keyphrases = extractKeyphrases(text)
# ...

Basically you want to decode the contents of the file into a unicode string right after you read it. The rest of the program is then spared from conversion problems. Please also make sure the file really is in UTF-8 encoding. If you are unsure, try the latin1 encoding, because it will never throw an exception while decoding (but it will, of course, still produce wrong text when the file is not actually latin1 encoded).
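As a minimal sketch of how the read/write part of the script above could look with that change: on Python 2.7 you can let io.open do the decoding (and the encoding when writing the results back out). This is only an illustration, not the original repository's code; the read_article helper and the latin1 fallback are my own assumptions, while extractKeyphrases, extractSentences and the articles/keywords directories come from the script in the question.

# -*- coding: utf-8 -*-
import io
import os

def read_article(path):
    # Hypothetical helper: try UTF-8 first, then fall back to latin1,
    # which never fails to decode (but gives wrong characters if the
    # file is not really latin1).
    try:
        with io.open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with io.open(path, 'r', encoding='latin1') as f:
            return f.read()

for article in os.listdir('articles'):
    text = read_article(os.path.join('articles', article))  # unicode from here on
    keyphrases = extractKeyphrases(text)
    summary = extractSentences(text)

    # Encode explicitly when writing, so Python 2 does not fall back to ASCII.
    with io.open(os.path.join('keywords', article), 'w', encoding='utf-8') as out:
        for keyphrase in keyphrases:
            out.write(keyphrase + u'\n')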

