I am trying to get arrays from a string: I want the texts as output, in array format.
I got this error:
for line in txt.readline():
AttributeError: 'unicode' object has no attribute 'readline'
def de_gutenberger(filename):
    with open(filename, 'rb') as f:
        txt = f.read()

    author, title = get_author_and_title(txt)

    # get rid of header & footer
    start_txt = "START OF THIS PROJECT GUTENBERG EBOOK"
    end_txt = "END OF THIS PROJECT GUTENBERG EBOOK"

    start = re.search('(' + start_txt + ').*?\n', txt)
    start_ind = start.end() if start != None else 0
    end = re.search('(' + end_txt + ').*?\n', txt)
    end_ind = end.start() if end != None else len(txt)

    word_string = stem_and_stop(nltk.word_tokenize(txt[start_ind:end_ind]))

    return author, title, word_string
def get_books(folder, files):
    authors = []
    texts = []
    titles = []
    for i, f in enumerate(files):
        print("Number {} of {}".format(i+1, len(files)))
        author, title, txt = de_gutenberger(folder + f)

        for line in txt.readlines():
            y = [value for value in line.split()]
            texts.append(y)

        authors.append(author)
        titles.append(title)
What does de_gutenberger do? It returns a tuple, but the third element is NOT a file object; it is a unicode string, so it has no readlines() method.
Try this:
def get_books(folder, files):
    authors = []
    texts = []
    titles = []
    for i, f in enumerate(files):
        print("Number {} of {}".format(i+1, len(files)))
        author, title, txt = de_gutenberger(folder + f)

        y = [value for value in txt.split()]
        texts.append(y)

        authors.append(author)
        titles.append(title)
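If you want to keep the per-line grouping your readlines() loop was aiming for (one inner list per line of text), you could split the string into lines first. A minimal sketch, assuming txt is a plain text string that still contains newline characters:
def get_books(folder, files):
    authors = []
    texts = []
    titles = []
    for i, f in enumerate(files):
        print("Number {} of {}".format(i + 1, len(files)))
        author, title, txt = de_gutenberger(folder + f)

        # splitlines() works on a plain (unicode) string, unlike readlines()
        for line in txt.splitlines():
            texts.append(line.split())

        authors.append(author)
        titles.append(title)
    return authors, titles, texts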
I want to return all values from my function as a string. The return value currently comes out as, for example:
('EVENTS', ['test'], [], ['Alcohol'])
The code that returns the tuple is:
# imports below assume pdfminer.six
from io import StringIO
import re
import csv

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert(fname, pages=None, encoding='utf-8'):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()

    if len(text) >= 500:
        regex3 = re.search(r"\d+(?:[.-]\w+)*\s*(General Information|Process validation|Justification of Specification(s)|Specification|Analytical Procedures|Validation of Analytical Procedures|Batch Analyses|Justification of Specification|Reference Standards or Materials|Container Closure Systems|Stability Summary and Conclusions|Post Approval Stability Protocol and Stability Commitment)", text, re.IGNORECASE)
        if regex3:
            org = []
            with open('C:\\Users\\Ak\\.spyder-py3\\org.csv', newline='', encoding='latin1') as myFile:
                reader = csv.reader(myFile)
                for row in reader:
                    if len(row[1]) >= 4:
                        v = re.search(r'\b' + re.escape(row[1]) + r'\b', text, re.IGNORECASE)
                        if v:
                            a = v.group(0)
                            org.append(a)
                            break

            dosage = []
            with open('C:\\Users\\Ak\\.spyder-py3\\do.csv', newline='', encoding='latin1') as myFile:
                reader = csv.reader(myFile)
                for row in reader:
                    if len(row[1]) >= 4:
                        w = re.search(r'\b' + re.escape(row[1]) + r'\b', text, re.IGNORECASE)
                        if w:
                            b = w.group(0)
                            dosage.append(b)
                            break

            substances = []
            with open('C:\\Users\\Ak\\.spyder-py3\\sub.csv', newline='', encoding='latin1') as myFile:
                reader = csv.reader(myFile)
                for row in reader:
                    if len(row[1]) >= 4:
                        z = re.search(r'\b' + re.escape(row[1]) + r'\b', text, re.IGNORECASE)
                        if z:
                            c = z.group(0)
                            substances.append(c)
                            break

            o = regex3.group(1), org, dosage, substances
            return o
From here I want to return the values as:
EVENTS,test, [], Alcohol
OR
EVENTS,test,,Alcohol
How can I format the return value as a string?
You can try to do the following:
o = regex3.group(1), "".join(org), "".join(dosage), "".join(substances)
o = list(o)
# Join list items using join()
str_list = ",".join(o)
# Option 1
# str_list = eval(str_list)
# OR
# Option 2
# str_list = str_list.replace("'", "")
return str_list
do tell if this works.
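For the example tuple in the question, that join produces the second form you asked for. A quick check with hard-coded stand-in values (hypothetical placeholders for regex3.group(1), org, dosage and substances):
# Stand-in values matching the question's example tuple
group1, org, dosage, substances = 'EVENTS', ['test'], [], ['Alcohol']

o = group1, "".join(org), "".join(dosage), "".join(substances)
str_list = ",".join(o)
print(str_list)  # EVENTS,test,,Alcohol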
First I would convert all the lists in the tuple to strings with the map function. Then, from the resulting list (or tuple) of strings, I can easily build a single string containing all the elements using join.
Here is an example:
def convert_2_str(n):
    if type(n) == list:
        return ','.join(n)
    elif type(n) == str:
        return n

a = ('a', '', ['b'], 'c')
a = list(map(convert_2_str, a))
print(','.join(a))
and the output would be:
'a,,b,c'
If you liked the solution, please upvote it :).
PhraseMatcher returns an empty list when trying to tag more than one item inside a document. I am passing values from a column in a CSV to tag product names inside the document, but PhraseMatcher only tags the rows that contain a single string. When there is more than one element to tag, it returns an empty list. What's the solution for this?
import spacy
import re
import csv
from spacy.matcher import PhraseMatcher

# Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

#nlp = spacy.blank('en')
nlp = spacy.load('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('PRODUCT')

DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\smoke\\'
matcher = PhraseMatcher(nlp.vocab)

list_str_index = []
to_train_ents = []

with open('qq1.csv', newline='') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        try:
            product = row[0].lower()
            filename = row[1]
            #print(product)
            file = open(DIR + filename, "r", encoding='utf-8')
            filecontents = file.read()
            #print(product)
            for s in filecontents:
                filecontents = re.sub(r'\[[0-9]*\]', ' ', filecontents)
                filecontents = re.sub(r'\s+', ' ', filecontents)
                #filecontents = filecontents.encode().decode('unicode-escape')
                filecontents = ''.join([line.lower() for line in filecontents])
                print(product.split())
                matcher.add('PRODUCT', None, nlp(product))
                doc = nlp(filecontents)
                matches = matcher(doc)
                #print(matches)
                list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
                to_train_ents.append((filecontents, dict(entities=list_str_index)))
                break
        except Exception as e:
            #print(e)
            pass
The to_train_ents list ends up empty when there is more than one element to tag inside the doc. The elements to tag are provided in a CSV, as shown below:
SAMPLE CSV:
PRODUCT             FILES
ABC                 XXXX
ABC2, ABC3, BCA3    XXXX
BC2                 XXXX
So, in the case of the 2nd row, an empty list ends up in to_train_ents. What's the solution for tagging such cases in the doc?
AFTER REMOVAL OF TRY AND EXCEPT BLOCK:
This time I kept in my CSV only the product names and the filenames that are actually present in the directory containing the files for tagging. But when I send a product name that has two names to tag in the doc, it still returns an empty list (a possible fix is sketched after the code below).
SAMPLE CODE NOW:
import spacy
import re
import csv
from spacy.matcher import PhraseMatcher

# Function to convert PhraseMatcher return value to string indexes
def str_index_conversion(lbl, doc, matchitem):
    o_one = len(str(doc[0:matchitem[1]]))
    subdoc = doc[matchitem[1]:matchitem[2]]
    o_two = o_one + len(str(subdoc))
    return (o_one, o_two, lbl)

#nlp = spacy.blank('en')
nlp = spacy.load('en')

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

ner.add_label('PRODUCT')

DIR = 'C:\\Users\\Lenovo\\.spyder-py3\\sanity\\'
matcher = PhraseMatcher(nlp.vocab)

list_str_index = []
to_train_ents = []

with open('qq2.csv', newline='') as myFile:
    reader = csv.reader(myFile)
    for row in reader:
        product = row[0].lower()
        filename = row[1]
        #print(product)
        file = open(DIR + filename, "r", encoding='utf-8')
        filecontents = file.read()
        #print(product)
        for s in filecontents:
            filecontents = re.sub(r'\[[0-9]*\]', ' ', filecontents)
            filecontents = re.sub(r'\s+', ' ', filecontents)
            #filecontents = filecontents.encode().decode('unicode-escape')
            filecontents = ''.join([line.lower() for line in filecontents])
            #print(product.split())
            #product_patterns = [nlp(text) for text in product.split()]
            matcher.add('PRODUCT', None, nlp(product))
            doc = nlp(filecontents)
            matches = matcher(doc)
            #print(matches)
            list_str_index = [str_index_conversion('PRODUCT', doc, x) for x in matches]
            to_train_ents.append((filecontents, dict(entities=list_str_index)))
            break
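One possible explanation, not confirmed in the question: when a row contains something like "ABC2, ABC3, BCA3", the whole field is passed to nlp() as a single phrase, so PhraseMatcher only looks for that exact token sequence and never for the individual names. A minimal sketch of splitting the field into one pattern per name, assuming the same spaCy 2.x matcher.add(key, callback, *patterns) call style used in the question:
# Hypothetical fix: one PhraseMatcher pattern per comma-separated product name
product_names = [p.strip() for p in row[0].lower().split(',') if p.strip()]
patterns = [nlp(name) for name in product_names]

# Same spaCy 2.x signature as in the question: label, callback, then patterns
matcher.add('PRODUCT', None, *patterns)

doc = nlp(filecontents)
matches = matcher(doc)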
How do I call a function inside a for loop in Python? I must call this function:
def EE():
    print("dd")
inside this:
def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        files = request.FILES.getlist('file_field')
        fs = FileSystemStorage()
        for f in files:
            filename = fs.save(f.name, f)
            ee = EE()
            print(ee)
        number_of_files = len(files)
        uploaded_file_url = fs.url(filename)
    return render(request, 'core/simple_upload.html', {
        # 'uploaded_file_url': uploaded_file_url
    })
The way you have written it is correct. But since your function doesn't return any value, I doubt you will get the output you expect.
Assuming the function to be called and the caller are in the same scope:
def sample_function():
    return "This is a sample function."

def main_function():
    # function call
    x = sample_function()
    print(x)
    # add your logic here.
Hope this will help.
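Applied to the code in the question, a minimal sketch (assuming you actually want EE to hand a value back to upload_file rather than just print it; it reuses the same names, UploadFileForm, FileSystemStorage and render, from your view):
def EE():
    # return instead of print so the caller can use the value
    return "dd"

def upload_file(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        files = request.FILES.getlist('file_field')
        fs = FileSystemStorage()
        for f in files:
            filename = fs.save(f.name, f)
            ee = EE()   # call the function inside the loop
            print(ee)   # prints "dd" once per uploaded file
    return render(request, 'core/simple_upload.html', {})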
# imports assumed from the libraries used below
import re
from itertools import chain

import PyPDF2
import textract
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize

def sentence_finder(text, word):
    sentences = sent_tokenize(text)
    return [sent for sent in sentences if word in word_tokenize(sent)]

def EE(filename, no_of_files):
    for i in range(no_of_files):
        try:
            print('\n')
            print(i + 1)
            pdfFileObj = open(filename, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num_pages = pdfReader.numPages
            count = 0
            text = ""
            # The while loop will read each page
            while count < num_pages:
                pageObj = pdfReader.getPage(count)
                count += 1
                text += pageObj.extractText()
            # This if statement checks whether the above library returned words.
            # It's done because PyPDF2 cannot read scanned files.
            if text != "":
                text = text
            # If the above is empty, we run the OCR library textract to
            # convert scanned/image-based PDF files into text
            else:
                text = textract.process(filename, method='tesseract', language='eng')

            # select relevant section: education qualification
            textt = re.search(r'EDUCATION\n.*?SKILLS', text, re.DOTALL).group()
            edu_qulification = textt[textt.find('\n') + 1:textt.rfind('\n')]
            srt1 = edu_qulification.lower()
            # print(edu_qulification)
            str12 = srt1.replace("\n", ". ")
            str2 = str12.replace("m.s.", "master")
            # print(str2)

            syn = synonyms = wordnet.synsets('degree')
            syn_set1 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            syn = synonyms = wordnet.synsets('BACHELOR')
            syn_set2 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            syn = synonyms = wordnet.synsets('Master')
            syn_set3 = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            listone = ['bsc', 'be', 'btech']
            listtwo = ['m.s.']
            mergedlist = listone + syn_set1 + syn_set2 + syn_set3 + listtwo
            # print(mergedlist)

            for i in mergedlist:
                sent_part = sentence_finder(str2, i)
                # print(sent_part)
                if not sent_part:
                    pass
                else:
                    Digree = sentence_finder(str2, i)

            synn = synonyms = wordnet.synsets('university')
            syn_seta = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            synn = synonyms = wordnet.synsets('institute')
            syn_setb = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            synn = synonyms = wordnet.synsets('college')
            syn_setc = list(chain.from_iterable([word.lemma_names() for word in synonyms]))
            listthree = ['center']
            mergedlistt = listthree + syn_seta + syn_setb + syn_setc
            # print(mergedlistt)

            for j in mergedlistt:
                sent_partt = sentence_finder(str2, j)
                # print(sent_partt)
                if not sent_partt:
                    pass
                else:
                    University = sentence_finder(str2, j)

            # Digree = sentence_finder(str2, 'BACHELOR')
            # University = sentence_finder(str2, 'UNIVERSITY')
            print(Digree)
            print(University)
            print(".................................................................")
            # print(University)
        except:
            print("No Education Qualification mentioned")
For some reason my code (following) raises a ValueError which I cannot understand. Please evaluate my code too. You can find the project I am trying to do at
http://www.ocr.org.uk/Images/226767-unit-j276-03-programming-project-task-1-sample-non-exam-assessment.pdf
fileid = "details for nea.txt"

ID = []
surname = []
forename = []
dob = []
addr = []
addrT = []
addrTh = []
addrF = []
addrFi = []
homNum = []
gend = []
tutor = []
schoolEm = []

def Read():
    file = open(fileid, "r")
    Record = file.readline()
    for line in Record:
        line = line.strip()
        A,B,C,D,E,F,G,H,I,J,K,L,M = line.split(',')
        ID.append(A)
        surname.append(B)
        forename.append(C)
        dob.append(D)
        addr.append(E)
        addrT.append(F)
        addrTh.append(G)
        addrF.append(H)
        addrFi.append(I)
        homNum.append(J)
        gend.append(K)
        tutor.append(L)
        schoolEm.append(M)
    file.close()

def Save():
    Record = []
    file = open(fileid, "w")
    for i in range(len(ID)):
        Record.append(ID[i] +","+surname[i]+","+forename[i]+","+dob[i]+","+addr[i]+","+addrT[i]+","+addrTh[i]+","+addrF[i]+","+addrFi[i]+","+homNum[i]+","+gend[i]+","+tutor[i]+","+schoolEm[i]+"\n")
    file.writelines(Record)
    file.close()

Read()
print(ID)
print(surname)
The text file I used goes as follows:
01,abe,fat,01/02/02,5,Stoney Lane,Stur,Dorset,DR101LM,0123,M,C,email#sc
The lists titled addr and addrT represent the different lines of the address.
Put the last three lines inside main; the ValueError should go away.
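A minimal sketch of what this answer seems to suggest, wrapping the three module-level calls in a main function:
def main():
    Read()
    print(ID)
    print(surname)

if __name__ == "__main__":
    main()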
Here's the code I tried. It gave me a syntax error highlighting 'data'. Any help? The .txt file has 4 columns if that's of any help.
def file():
    file = open('hsp.txt', 'r')
    col = [] data = file.readlines()
    for i in range(1,len(data)-1):
        col.append(int(float(data[i].split(',')[5])))
    return col

def hist(col):
    handspan = []
    for i in range(11):
        handspan.append(0)
    for i in (col):
        handspan[i] += 1
    return handspan

col = file()
handspan = hist(col)
print(col)
print(handspan)
It is because your line
col = [] data = file.readlines()
should be on two separate lines:
col = []
data = file.readlines()
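In context, the function would then look something like this, keeping the rest of the logic unchanged (note that the column index [5] assumes at least 6 comma-separated columns, so it may need adjusting for a 4-column file):
def file():
    file = open('hsp.txt', 'r')
    col = []
    data = file.readlines()
    for i in range(1, len(data) - 1):
        # [5] picks the 6th comma-separated field; adjust if the file only has 4 columns
        col.append(int(float(data[i].split(',')[5])))
    return col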
You can try this; it worked for me.
Since it is a histogram, it yields a dictionary.
Better answers are welcome!
import string

def list_from_file(filename):
    myfile = open(filename, 'r')
    data = myfile.read().split()
    col = []
    for word in data:
        col.append(word)
    return col

def myhist(col):
    hist = {}
    for word in col:
        word = word.lower()
        word = word.strip(string.punctuation + string.whitespace)
        hist[word] = hist.get(word, 0) + 1
    return hist

col = list_from_file('em.txt')
colf = myhist(col)
print(colf)
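Since better answers were invited: a similar word histogram can also be built with collections.Counter from the standard library. This is only an alternative sketch, not what the answer above used:
import string
from collections import Counter

def counter_hist(filename):
    with open(filename, 'r') as f:
        words = f.read().split()
    # normalise the same way as myhist(): lowercase and strip punctuation/whitespace
    cleaned = [w.lower().strip(string.punctuation + string.whitespace) for w in words]
    return Counter(cleaned)

# hypothetical usage with the same file name as above
print(counter_hist('em.txt'))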