python - UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
I have taken the code from https://github.com/davidadamojr/textrank and I am facing a problem. I tried to solve it by placing utf-8 in "keyphrases = decode('utf-8').extractKeyphrases(text)", but that failed.
Here is the code:
""" paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/mihalcea.pdf external dependencies: nltk, numpy, networkx based on https://gist.github.com/voidfiles/1646117 """ import nltk import itertools operator import itemgetter import networkx nx import sys import os #apply syntactic filters based on pos tags def filter_for_tags(tagged, tags=['nn', 'jj', 'nnp']): return [item item in tagged if item[1] in tags] def normalize(tagged): return [(item[0].replace('.', ''), item[1]) item in tagged] def unique_everseen(iterable, key=none): "list unique elements, preserving order. remember elements ever seen." # unique_everseen('aaaabbbccdaabbb') --> b c d # unique_everseen('abbccad', str.lower) --> b c d seen = set() seen_add = seen.add if key none: element in itertools.ifilterfalse(seen.__contains__, iterable): seen_add(element) yield element else: element in iterable: k = key(element) if k not in seen: seen_add(k) yield element def ldistance(firststring, secondstring): "function find levenshtein distance between 2 words/sentences - gotten http://rosettacode.org/wiki/levenshtein_distance#python" if len(firststring) > len(secondstring): firststring, secondstring = secondstring, firststring distances = range(len(firststring) + 1) index2, char2 in enumerate(secondstring): newdistances = [index2 + 1] index1, char1 in enumerate(firststring): if char1 == char2: newdistances.append(distances[index1]) else: newdistances.append(1 + min((distances[index1], distances[index1+1], newdistances[-1]))) distances = newdistances return distances[-1] def buildgraph(nodes): "nodes - list of hashables represents nodes of graph" gr = nx.graph() #initialize undirected graph gr.add_nodes_from(nodes) nodepairs = list(itertools.combinations(nodes, 2)) #add edges graph (weighted levenshtein distance) pair in nodepairs: firststring = pair[0] secondstring = pair[1] levdistance = ldistance(firststring, secondstring) gr.add_edge(firststring, secondstring, weight=levdistance) return gr def extractkeyphrases(text): #tokenize text using nltk wordtokens = nltk.word_tokenize(text) #assign pos tags words in text tagged = nltk.pos_tag(wordtokens) textlist = [x[0] x in tagged] tagged = filter_for_tags(tagged) tagged = normalize(tagged) unique_word_set = unique_everseen([x[0] x in tagged]) word_set_list = list(unique_word_set) #this used determine adjacent words in order construct keyphrases 2 words graph = buildgraph(word_set_list) #pagerank - initial value of 1.0, error tolerance of 0,0001, calculated_page_rank = nx.pagerank(graph, weight='weight') #most important words in ascending order of importance keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=true) #the number of keyphrases returned relative size of text (a third of number of vertices) athird = len(word_set_list) / 3 keyphrases = keyphrases[0:athird+1] #take keyphrases multiple words consideration done in paper - if 2 words adjacent in text , selected keywords, join them #together modifiedkeyphrases = set([]) dealtwith = set([]) #keeps track of individual keywords have been joined form keyphrase = 0 j = 1 while j < len(textlist): firstword = textlist[i] secondword = textlist[j] if firstword in keyphrases , secondword in keyphrases: keyphrase = firstword + ' ' + secondword modifiedkeyphrases.add(keyphrase) dealtwith.add(firstword) dealtwith.add(secondword) else: if firstword in keyphrases , firstword not in dealtwith: modifiedkeyphrases.add(firstword) #if last word in text, , keyword, #it has no chance of being keyphrase @ point if j == 
len(textlist)-1 , secondword in keyphrases , secondword not in dealtwith: modifiedkeyphrases.add(secondword) = + 1 j = j + 1 return modifiedkeyphrases def extractsentences(text): sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') sentencetokens = sent_detector.tokenize(text.strip()) graph = buildgraph(sentencetokens) calculated_page_rank = nx.pagerank(graph, weight='weight') #most important sentences in ascending order of importance sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=true) #return 100 word summary summary = ' '.join(sentences) summarywords = summary.split() summarywords = summarywords[0:101] summary = ' '.join(summarywords) return summary def writefiles(summary, keyphrases, filename): "outputs keyphrases , summaries appropriate files" print "generating output " + 'keywords/' + filename keyphrasefile = open('keywords/' + filename, 'w') keyphrase in keyphrases: keyphrasefile.write(keyphrase + '\n') keyphrasefile.close() print "generating output " + 'summaries/' + filename summaryfile = open('summaries/' + filename, 'w') summaryfile.write(summary) summaryfile.close() print "-" #retrieve each of articles articles = os.listdir("articles") article in articles: print 'reading articles/' + article articlefile = open('articles/' + article, 'r') text = articlefile.read() keyphrases = decode('utf-8').extractkeyphrases(text) summary = extractsentences(text) writefiles(summary, keyphrases, article)
The error:
Reading articles/1.txt
Traceback (most recent call last):
  File "c:\users\dell\desktop\python\s\fyp\relevancy\test\textrank-master\textrank.py", line 166, in <module>
    keyphrases = extractKeyphrases(text).setdefaultencoding("utf-8")
  File "c:\users\dell\desktop\python\s\fyp\relevancy\test\textrank-master\textrank.py", line 72, in extractKeyphrases
    wordTokens = nltk.word_tokenize(text)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 93, in word_tokenize
    return [token for sent in sent_tokenize(text)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 82, in sent_tokenize
    return tokenizer.tokenize(text)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1270, in tokenize
    return list(self.sentences_from_text(text, realign_boundaries))
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1318, in sentences_from_text
    return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1309, in span_tokenize
    return [(sl.start, sl.stop) for sl in slices]
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1348, in _realign_boundaries
    for sl1, sl2 in _pair_iter(slices):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
    prev = next(it)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1324, in _slices_from_text
    if self.text_contains_sentbreak(context):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1369, in text_contains_sentbreak
    for t in self._annotate_tokens(self._tokenize_words(text)):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1504, in _annotate_second_pass
    for t1, t2 in _pair_iter(tokens):
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter
    prev = next(it)
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 621, in _annotate_first_pass
    for aug_tok in tokens:
  File "c:\python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 586, in _tokenize_words
    for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
Any idea? (Sorry for my bad English.)
I think what you are looking for is:
# ...
text = articleFile.read().decode('utf-8')
keyphrases = extractKeyphrases(text)
# ...
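The reason it crashes: in Python 2, articleFile.read() returns a byte string (str), and deep inside NLTK's punkt tokenizer that byte string gets mixed with unicode, at which point Python implicitly decodes it with the default ascii codec and fails on the first non-ASCII byte. A tiny illustration of the same failure (the byte values here are just an example I picked; 0xe2 happens to start many three-byte UTF-8 sequences, such as curly quotes):

# Python 2
s = 'they\xe2\x80\x99re'  # UTF-8 bytes: "they're" with a curly apostrophe
u = s.decode('utf-8')     # explicit decode works and returns a unicode object
u'' + s                   # implicit ascii decode of s raises:
                          # UnicodeDecodeError: 'ascii' codec can't decode
                          # byte 0xe2 in position 4: ordinal not in range(128)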
Basically you want to decode the contents of the file into a unicode string as soon as you read it. That saves the rest of the program from conversion problems. Please make sure the file really is in UTF-8 encoding. If unsure, try the latin1 encoding, because decoding as latin1 will never throw an exception (but it of course still produces wrong text when the file is not actually latin1-encoded).
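If you prefer to handle the decoding at the file boundary instead of calling .decode() yourself, here is a minimal sketch using codecs.open from the standard library (the loop and the function names are the ones from your code):

import os
import codecs

# codecs.open decodes for you, so read() returns unicode directly
for article in os.listdir("articles"):
    articleFile = codecs.open('articles/' + article, 'r', encoding='utf-8')
    text = articleFile.read()  # already a unicode object
    articleFile.close()
    keyphrases = extractKeyphrases(text)
    summary = extractSentences(text)
    writeFiles(summary, keyphrases, article)

The same idea applies inside writeFiles: open the output files with codecs.open(..., 'w', encoding='utf-8') so that unicode keyphrases are encoded on write; writing them to a file opened with plain open() would raise a UnicodeEncodeError instead.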