knitr + python plot tree diagram

knitr + python plot tree diagram - python

I am an R user who is beginning to work in Python. I am trying to use knitr to knit a Python file and capture a tree diagram, but it is not working. Here is the .Rnw (LaTeX-based) file I am trying to knit:
% Minimal knitr (.Rnw) document that runs Python code chunks.
% Every chunk is closed by a line containing only "@".
\documentclass{article}
\begin{document}
Hello world!
<<r test-python1, engine='python'>>=
x = 'hello, python world!'
print(x)
@
<<r test-python2, engine='python', echo=FALSE>>=
import nltk
from nltk.tree import *
from nltk.draw import tree
grammar = r"""
NP:
{<.*>+} # Chunk everything
}<VBD|IN>+{ # Chink sequences of VBD and IN
"""
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
# Bare expression: nothing is printed or drawn by this line.
result
@
% NOTE(review): this chunk has no engine option, so "python" looks like it is
% only the chunk label and the code would be evaluated as R -- confirm.
<<python>>=
x = 'hello, python world!'
print(x.split(' '))
@
\end{document}
But all that is returned is:
It appears that nltk can't be found, but I can run it in Spyder just fine and plot the tree diagram. What do I need to do to include this diagram in the Rnw file for output to a PDF? The x.split error indicates Python is found, but I'm not importing correctly in knitr.
I am using Python 3.4.3 64 bit for Windows 7.

Not sure if knitr does Python graphics per: knitr: python engine output not in .md or .html
If not here's the way to solve this:
% knitr (.Rnw) document: a Python chunk draws an NLTK tree to PostScript,
% an R chunk converts it to PNG, and LaTeX includes the PNG as a figure.
% Every chunk is closed by a line containing only "@".
\documentclass{article}
\begin{document}
Hello world!
<<r test-python1, engine='python'>>=
x = 'hello, python world!'
print(x)
@
<<r test-python2, engine='python', echo=FALSE>>=
import nltk
from nltk import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget
# Build the tree for "the dog chased the cat" bottom-up.
dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
vp = Tree('vp', [Tree('v', ['chased']), dp2])
sentence = Tree('s', [dp1, vp])
# Draw the tree on a canvas and save it as PostScript.
cf = CanvasFrame()
tc = TreeWidget(cf.canvas(),sentence)
cf.add_widget(tc,10,10) # (10,10) offsets
# NOTE(review): presumably the cache/ directory must already exist -- confirm.
cf.print_to_file('cache/tree.ps')
cf.destroy()
@
<<r r-code>>=
# Convert the PostScript file to PNG with ImageMagick so LaTeX can include it.
fls <- file.path(getwd(), c('cache/tree.ps', 'cache/tree.png'))
system(sprintf('"C:/Program Files/ImageMagick-6.9.0-Q16/convert.exe" %s %s', fls[1], fls[2]))
@
\begin{figure}[!ht]
\centering
\includegraphics[scale=.71]{cache/tree.png}
\caption{yay!} \label{regexviz}
\end{figure}
<<r test-python3, engine='python'>>=
x = 'hello, python world!'
print(x.split(' '))
@
\end{document}

Related

Keywords extraction in Python - How to handle hyphenated compound words

I'm trying to perform keyphrase extraction with Python, using KeyBert and pke PositionRank. You can see an extract of my code below.
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
# Sample sentence containing the hyphenated compound "life-cycle".
# NOTE(review): in the real pipeline the text presumably comes from
# text_cleaning(df_tassonomia.iloc[1077].text, ...) -- confirm.
text = "The life-cycle Global Warming Potential of the building resulting from the construction has been calculated for each stage in the life-cycle and is disclosed to investors and clients on demand"

# --- pke: rank candidate phrases with PositionRank ---
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
keyphrases = extractor.get_n_best(n=10)
print(keyphrases)

# --- KeyBERT: rank phrases proposed by KeyphraseCountVectorizer ---
kw_model = KeyBERT(model="all-mpnet-base-v2")
keybert_options = {
    "vectorizer": KeyphraseCountVectorizer(),
    "keyphrase_ngram_range": (1, 5),
    "top_n": 10,
}
keyphrases_2 = kw_model.extract_keywords(docs=text, **keybert_options)
print("")
print(keyphrases_2)
and here the results:
[('cycle global warming potential', 0.44829175082921835), ('life', 0.17858359644549557), ('cycle', 0.15775994057934534), ('building', 0.09131084381406684), ('construction', 0.08860454878871142), ('investors', 0.05426710724030216), ('clients', 0.054111700289631526), ('stage', 0.045672396861507744), ('demand', 0.039158055731066406)]
[('cycle global warming potential', 0.5444), ('building', 0.4479), ('construction', 0.3476), ('investors', 0.1967), ('clients', 0.1519), ('demand', 0.1484), ('cycle', 0.1312), ('stage', 0.0931), ('life', 0.0847)]
I would like hyphenated compound words (such as life-cycle in the example) to be treated as a single word, but I cannot work out how to exclude the - from the list of word separators.
Thank you in advance for any help.
Francesca

this could be a silly workaround but it may help :
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
def _restore_hyphens(keyphrases, protected):
    """Turn the placeholder underscores in keyphrases back into hyphens.

    keyphrases: iterable of (phrase, score) pairs.
    protected:  phrases that contained "_" before the substitution and
                must therefore be left untouched.
    Returns a new list of (phrase, score) pairs.
    """
    restored = []
    for phrase, score in keyphrases:
        if "_" in phrase and phrase not in protected:
            phrase = phrase.replace("_", "-")
        restored.append((phrase, score))
    return restored


# Adjacent string literals are concatenated, so the sentence stays one
# line of text without an unterminated string literal.
text = (
    "The life-cycle Global Warming Potential of the building "
    "resulting from the construction has been calculated for each stage in "
    "the life-cycle and is disclosed to investors and clients on demand"
)

# Remember tokens that already contain "_" so they are not rewritten later,
# then hide the hyphens from the extractors by replacing them with "_".
original = {token for token in text.split() if "_" in token}
text = text.replace("-", "_")

# Pke
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
keyphrases = extractor.get_n_best(n=10)
keyphrases_replaced = _restore_hyphens(keyphrases, original)
print(keyphrases_replaced)

# KeyBert
# The model must be created before use; same model name as in the question.
kw_model = KeyBERT(model="all-mpnet-base-v2")
keyphrases_2 = kw_model.extract_keywords(docs=text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         keyphrase_ngram_range=(1, 5),
                                         top_n=10
                                         )
# The text passed in contains underscores, so restore hyphens here as well.
keyphrases_2 = _restore_hyphens(keyphrases_2, original)
print("")
print(keyphrases_2)
The output should look like this:
[('life-cycle global warming potential', 0.5511001220016548), ('life-cycle', 0.20123353586644233), ('construction', 0.11945270995269436), ('building', 0.10637157845606555), ('investors', 0.06675114967366767), ('stage', 0.05503532672910801), ('clients', 0.0507262942318816), ('demand', 0.05056281895492815)]
I hope this helps :)

The issue has been fixed in the latest pke updates: https://github.com/boudinfl/pke/issues/195
import pke
# Select noun-phrase candidates from a sentence that contains a hyphenated
# compound and print the candidate keys.
document = 'BERT is a state-of-the-art model.'
noun_phrase_grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

extractor = pke.unsupervised.TopicRank()
extractor.load_document(input=document, language='en')
extractor.grammar_selection(grammar=noun_phrase_grammar)
candidate_keys = extractor.candidates.keys()
print(candidate_keys)
now returns this output:
dict_keys(['bert', 'state-of-the-art model'])

Custom encoding and decoding UTF special characters

Just for fun I've been embedding text in images. The following code is a distillation and demonstration of the encoding and decoding mechanism I am using.
class encChar:
    """Pack one character into a (red, green, blue) triple of small offsets.

    The character's code point is written in binary, left-padded to eight
    digits, and split 3/2/3 into the three channels.  Only the first eight
    binary digits are used, so a code point above 255 keeps just its eight
    most significant bits and does not survive a round trip.
    """

    def __init__(self, char):
        # p holds the encoded (red, green, blue) triple.
        self.p = self.enc(char)

    def enc(self, char):
        """Return the (red, green, blue) offsets for char."""
        bits = format(ord(char), 'b').rjust(8, '0')
        red = int(bits[0:3], 2)
        green = int(bits[3:5], 2)
        blue = int(bits[5:8], 2)
        return (red, green, blue)

    def dec(self):
        """Rebuild a character from the stored triple."""
        red, green, blue = self.p
        # Green contributes only its two low bits, as in the encoding.
        code = (red << 5) | ((green & 0b11) << 3) | blue
        return chr(code)
testStr = """Languages
Deutsch
Español
Français
한국어
Italiano
Русский
Tagalog
Tiếng Việt
中文"""
# Round-trip every line of testStr through encChar, printing the original and
# the decoded text and collecting both in `result`.
result= ""
for line in testStr.split("\n"):
result+=line+"\n"
print(line)
# Encode each character, decode it straight back, and rejoin the line.
print("".join([encChar(k).dec() for k in line]))
result+="".join([encChar(k).dec() for k in line])+"\n"
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# the next two statements belong inside the loop -- confirm.
print()
result+="\n"
# Write the collected text to errorop.txt as UTF-8.
with open("errorop.txt","w",encoding="utf8") as op:
op.write(result)
Which produces the following document:
Languages
Languages
Deutsch
Deutsch
Español
Español
Français
Français
한국어
ÕÅ
Italiano
Italiano
Русский
Tagalog
Tagalog
Tiếng Việt
Tiõng Viöt
中文
Ë
As you can see several runes are altered by the process and I'm wondering how I can preserve them through this process.

Converting stanford dependencies in numbered format

I am using the Stanford dependency parser, and I get the following output for the sentence
I shot an elephant in my sleep
>>>python dep_parsing.py
[((u'shot', u'VBD'), u'nsubj', (u'I', u'PRP')), ((u'shot', u'VBD'), u'dobj', (u'elephant', u'NN')), ((u'elephant', u'NN'), u'det', (u'an', u'DT')), ((u'shot', u'VBD'), u'nmod', (u'sleep', u'NN')), ((u'sleep', u'NN'), u'case', (u'in', u'IN')), ((u'sleep', u'NN'), u'nmod:poss', (u'my', u'PRP$'))]
However, I want the numbered tokens as output just as it is here
nsubj(shot-2, I-1)
root(ROOT-0, shot-2)
det(elephant-4, an-3)
dobj(shot-2, elephant-4)
case(sleep-7, in-5)
nmod:poss(sleep-7, my-6)
nmod(shot-2, sleep-7)
Here is my code till now.
from nltk.parse.stanford import StanfordDependencyParser
# Locations of the Stanford parser files, relative to the working directory.
stanford_parser_dir = 'stanford-parser/'
# NOTE(review): eng_model_path is built but not passed to the parser below -- confirm whether it is needed.
eng_model_path = stanford_parser_dir + "stanford-parser-models/edu/stanford/nlp/models/lexparser/englishRNN.ser.gz"
my_path_to_models_jar = stanford_parser_dir + "stanford-parser-3.5.2-models.jar"
my_path_to_jar = stanford_parser_dir + "stanford-parser.jar"

dependency_parser = StanfordDependencyParser(path_to_jar=my_path_to_jar, path_to_models_jar=my_path_to_models_jar)
result = dependency_parser.raw_parse('I shot an elephant in my sleep')
# next() works on both Python 2 and 3; the .next() method is Python-2-only.
dep = next(result)
# Each triple is ((head, head_tag), relation, (dependent, dependent_tag)).
a = list(dep.triples())
# Call form of print is valid on both Python 2 and 3 for a single argument.
print(a)
How can I have such an output?

Write a recursive function that traverses your tree. As a first pass, just try assigning the numbers to the words.

nltk NER word extraction

I have checked previous related threads, but did not solve my issue. I have written code to get NER from text.
# Tokenize and POS-tag the sentence, then chunk named entities.
text = "Stallone jason's film Rocky was inducted into the National Film Registry as well as having its film props placed in the Smithsonian Museum."
tokenized = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokenized)
# Chunk with binary=True and print the resulting tree.
namedEnt = nltk.ne_chunk(tagged, binary = True)
print namedEnt
# NOTE(review): this rebinds namedEnt to the binary=False result after the
# tree has been printed -- presumably unintended; confirm.
namedEnt = nltk.ne_chunk(tagged, binary = False)
which gives this sort of result
(S
(NE Stallone/NNP)
jason/NN
's/POS
film/NN
(NE Rocky/NNP)
was/VBD
inducted/VBN
into/IN
the/DT
(NE National/NNP Film/NNP Registry/NNP)
as/IN
well/RB
as/IN
having/VBG
its/PRP$
film/NN
props/NNS
placed/VBN
in/IN
the/DT
(NE Smithsonian/NNP Museum/NNP)
./.)
while I expect only NE as a result, like
Stallone
Rockey
National Film Registry
Smithsonian Museum
how to achieve this?
UPDATE
result = ' '.join([y[0] for y in x.leaves()]) for x in namedEnt.subtrees() if x.node == "NE"
print result
gives a syntax error; what is the correct way to write this?
UPDATE2
text = "Stallone jason's film Rocky was inducted into the National Film Registry as well as having its film props placed in the Smithsonian Museum."
tokenized = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokenized)
namedEnt = nltk.ne_chunk(tagged, binary = True)
print namedEnt
np = [' '.join([y[0] for y in x.leaves()]) for x in namedEnt.subtrees() if x.node == "NE"]
print np
error:
np = [' '.join([y[0] for y in x.leaves()]) for x in namedEnt.subtrees() if x.node == "NE"]
File "/usr/local/lib/python2.7/dist-packages/nltk/tree.py", line 198, in _get_node
raise NotImplementedError("Use label() to access a node label.")
NotImplementedError: Use label() to access a node label.
so I tried with
np = [' '.join([y[0] for y in x.leaves()]) for x in namedEnt.subtrees() if x.label() == "NE"]
which gives an empty result

The namedEnt returned is actually a Tree object which is a subclass of list. You can do the following to parse it:
# Newer NLTK releases raise NotImplementedError on Tree.node; use label().
# namedEnt must come from nltk.ne_chunk(tagged, binary=True) so that entity
# subtrees are labelled "NE".
[' '.join(y[0] for y in x.leaves()) for x in namedEnt.subtrees() if x.label() == "NE"]
Output:
['Stallone', 'Rocky', 'National Film Registry', 'Smithsonian Museum']
Setting the binary flag to True only indicates whether a subtree is a named entity (NE) or not, which is what we need above. When set to False, it gives more information, such as whether the NE is an Organization, Person, etc. For some reason, the results with the flag on and off don't seem to agree with one another.

UnicodeWarning: special characters in Tkinter

I have written a program in Tkinter (Python 2.7), a scrabblehelper in Norwegian which contains some special characters (æøå), which means my wordlist (ordliste) contains words with special characters.
When I run my function finnord(c*), it returns 'cd'. I am using an entry.get() to get the word to put in my function.
My problem is with the encoding of entry.get(). My locale encoding is UTF-8, but I get a Unicode error (the UnicodeWarning shown below) when I type any special characters in my entry box and match them against my word list.
Here is my output.
Warning (from warnings module):
File "C:\pythonprog\scrabble\feud.py", line 46
if s not in liste and s in ordliste:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode -
interpreting them as being unequal
When i write in my shell:
> ordinn.get()
u'k\xf8**e'
> ordinn.get().encode('utf-8')
'k\xc3\xb8**e'
> print ordinn.get()
kø**e
> print ordinn.get().encode('utf-8')
kÃ¸**e
Does anyone know why I can't match ordinn.get() (the entry) against my word list?

I can reproduce the error this way:
% python
Python 2.7.2+ (default, Oct 4 2011, 20:03:08)
[GCC 4.6.1] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> 'k\xf8**e' in [u'k\xf8**e']
__main__:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
False
So perhaps s is a str object, and liste or ordliste contains unicode, or (as eryksun points out in the comments) vice versa. The solution is to decode the str objects (most likely with the utf-8 codec) to make them unicode.
If that does not help, please print out and post the output of
print(repr(s))
print(repr(liste))
print(repr(ordliste))
I believe the problem can be avoided by converting all strings to unicode.
When you generate ordliste from norsk.txt, use
codecs.open('norsk.txt','r','utf-8'):
# NOTE(review): `encoding` is not used by the open call below, which always
# reads the file as UTF-8 -- confirm whether this assignment is still needed.
encoding = sys.stdin.encoding
with codecs.open('norsk.txt','r','utf-8') as fil:
    # codecs.open reads the underlying file in binary mode, so a file with
    # Windows (CRLF) line endings would leave '\r' on every word.
    # Strip both line-ending characters.
    ordliste = [line.rstrip(u'\r\n') for line in fil]
Convert all user input to unicode as soon as possible:
def get_unicode(widget):
    """Return the widget's current value as a unicode string.

    widget: any object with a get() method returning text or bytes.
    Byte strings are decoded as UTF-8; text is returned unchanged.
    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    streng = widget.get()
    # Decode only real byte strings.  `bytes` is `str` on Python 2, so this
    # also works there, and text values no longer go through an implicit
    # ASCII round-trip whose failure had to be swallowed.
    if isinstance(streng, bytes):
        streng = streng.decode('utf-8')
    return streng
So perhaps try this:
import Tkinter as tk
import tkMessageBox
import codecs
import itertools
import sys
# Letters that may replace a '*' when the user supplies no letters of their
# own: a-z plus the Norwegian letters ae, o-with-stroke and a-with-ring.
alfabetet = (u"abcdefghijklmnopqrstuvwxyz"
             u"\N{LATIN SMALL LETTER AE}"
             u"\N{LATIN SMALL LETTER O WITH STROKE}"
             u"\N{LATIN SMALL LETTER A WITH RING ABOVE}")
# sys.stdin, or its encoding, can be None when there is no console.
# codecs.open with encoding=None returns undecoded byte strings, which would
# bring back the str/unicode comparison problem, so fall back to UTF-8.
encoding = getattr(sys.stdin, 'encoding', None) or 'utf-8'
with codecs.open('norsk.txt','r',encoding) as fil:
    # codecs.open reads in binary mode, so CRLF files keep '\r' on each
    # line; strip both line-ending characters.
    ordliste = {line.rstrip(u'\r\n') for line in fil}
def get_unicode(widget):
    """Return the widget's current value as a unicode string.

    widget: any object with a get() method returning text or bytes.
    Byte strings are decoded as Latin-1; text is returned unchanged.
    """
    streng = widget.get()
    # `bytes` is the same type as `str` on Python 2, and on Python 3 it
    # avoids calling .decode on a text string, which has no such method.
    if isinstance(streng, bytes):
        streng = streng.decode('latin-1')
    return streng
def siord():
    """Button callback: look up the entered pattern and show the result.

    Reads the pattern from `ordinn`, gets the usable letters from
    lagtabell(), and shows the matching words, or a "no match" message,
    in a message box.  Any exception raised during the lookup is printed
    and reported to the user in an error dialog.
    """
    alfa=lagtabell()
    try:
        streng = get_unicode(ordinn)
        ordene=finnord(streng,alfa)
        # finnord always returns the pattern itself followed by '.', so its
        # result is never empty; the pattern plus '.' alone means that no
        # word matched.
        if ordene == streng + u'.':
            # There are no words that match
            tkMessageBox.showinfo('Dessverre..','Det er ingen ord som passer...')
        else:
            # Done: The words that fit the pattern
            tkMessageBox.showinfo('Ferdig',
                                  'Ordene som passer er:\n'+ordene.encode('utf-8'))
    except Exception as err:
        # There has been a mistake .. Check your word
        print(repr(err))
        tkMessageBox.showerror('ERROR','Det har skjedd en feil.. Sjekk ordet ditt.')
def finnord(streng,alfa):
    """Find the words in `ordliste` that fit the pattern `streng`.

    Each '*' in streng is filled, left to right, with letters drawn from
    alfa; every ordering of distinct positions in alfa is tried.  Returns
    one unicode string: the pattern, then the matching words, joined by
    commas and ended with a full stop.
    """
    wildcard_count = streng.count(u'*')
    matches = set()
    for letters in itertools.permutations(alfa, wildcard_count):
        candidate = streng
        for letter in letters:
            # Fill only the first remaining '*'.
            candidate = candidate.replace(u'*', letter, 1)
        if candidate in ordliste:
            matches.add(candidate)
    return u','.join([streng] + list(matches)) + u'.'
def lagtabell():
    """Return the letters that may be substituted for '*'.

    Uses the lower-cased content of `bokstinn` when it consists only of
    letters; otherwise, including when it is empty, the full alphabet
    `alfabetet`.
    """
    letters = get_unicode(bokstinn)
    return letters.lower() if letters.isalpha() else alfabetet
# Build the main window: an instruction label, the pattern entry, a second
# label and entry for the available letters, and a button that runs siord.
root = tk.Tk()
root.title('FeudHjelper av Martin Skow Røed')
root.geometry('400x250+450+200')
# root.iconbitmap('data/ikon.ico')
# Instruction text shown at the top of the window.
skrift1 = tk.Label(root,
text = '''\
Velkommen til FeudHjelper. Skriv inn de bokstavene du har, og erstatt ukjente med *.
F. eks: sl**ge
Det er kun lov til å bruke tre stjerner, altså tre ukjente bokstaver.''',
font = ('Verdana',8), wraplength=350)
skrift1.pack(pady = 5)
# ordinn holds the pattern typed by the user; siord reads it.
ordinn = tk.StringVar(None)
tekstboks = tk.Entry(root, textvariable = ordinn)
tekstboks.pack(pady = 5)
# What letters do you have? Eg "ahneki". Leave blank here if you want all the words.
skrift2 = tk.Label(root, text = '''Hvilke bokstaver har du? F. eks "ahneki". La det være blankt her hvis du vil ha alle ordene.''',
font = ('Verdana',8), wraplength=350)
skrift2.pack(pady = 10)
# bokstinn holds the user's available letters; lagtabell reads it.
bokstinn = tk.StringVar(None)
tekstboks2 = tk.Entry(root, textvariable = bokstinn)
tekstboks2.pack()
# Clicking the button calls siord.
knapp = tk.Button(text = 'Finn ord!', command = siord)
knapp.pack(pady = 10)
# Start the Tk event loop.
root.mainloop()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

knitr + python plot tree diagram - python

Related

Keywords extraction in Python - How to handle hyphenated compound words

Custom encoding and decoding UTF special characters

Converting stanford dependencies in numbered format

nltk NER word extraction

UnicodeWarning: special characters in Tkinter

Categories

Resources