Extract paragraphs instead of sentences with Python Tika

I've been trying different approaches to solve my issue, but nothing seems to be working - I need your help!
I found code that is quite effective at parsing a PDF (e.g. an annual report) and extracting sentences from it. I was, however, wondering how I could extract paragraphs from it instead of sentences.
My intuition would be to split on the double newline, i.e. "\n\n", rather than removing it, since it could indicate the start of a new paragraph. I haven't quite managed to get it to work, though. Does anyone have any tips?
# PDF parsing
import re
import string

import nltk
from tika import parser


class parsePDF:
    def __init__(self, url):
        self.url = url

    def extract_contents(self):
        """ Extract a pdf's contents using tika. """
        pdf = parser.from_file(self.url)
        self.text = pdf["content"]
        return self.text

    def clean_text(self):
        """ Extract & clean sentences from the raw text of the pdf. """
        # Remove non-ASCII characters
        printables = set(string.printable)
        self.text = "".join(filter(lambda x: x in printables, self.text))
        # Replace tabs with spaces
        self.text = re.sub(r"\t+", r" ", self.text)
        # Aggregate lines where the sentence wraps;
        # also, a line in CAPITALS is counted as a header
        fragments = []
        prev = ""
        for line in re.split(r"\n+", self.text):
            if line.isupper():
                prev = "."  # skip it
            elif line and (line.startswith(" ") or line[0].islower()
                           or not prev.endswith(".")):
                prev = f"{prev} {line}"  # make into one line
            else:
                fragments.append(prev)
                prev = line
        fragments.append(prev)
        # Clean the lines into sentences
        sentences = []
        for line in fragments:
            # Use regular expressions to clean the text
            url_str = (r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:#\-_=#]+\."
                       r"([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:#\-_=#])*")
            line = re.sub(url_str, r" ", line)  # URLs
            line = re.sub(r"^\s?\d+(.*)$", r"\1", line)  # headers
            line = re.sub(r"\d{5,}", r" ", line)  # figures
            line = re.sub(r"\.+", ".", line)  # multiple periods
            line = line.strip()  # leading & trailing spaces
            line = re.sub(r"\s+", " ", line)  # multiple spaces
            line = re.sub(r"\s?([,:;\.])", r"\1", line)  # punctuation spaces
            line = re.sub(r"\s?-\s?", "-", line)  # split-line words
            # Use nltk to split the line into sentences
            for sentence in nltk.sent_tokenize(line):
                s = str(sentence).strip().lower()  # lower case
                # Exclude tables of contents and short sentences
                if "table of contents" not in s and len(s) > 5:
                    sentences.append(s)
        return sentences
And this is how you can call it afterwards:
url = "https://www.aperam.com/sites/default/files/documents/Annual_Report_2021.pdf"  # any PDF URL you want
pp = parsePDF(url)
pp.extract_contents()
sentences = pp.clean_text()
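Here is a rough sketch of the "\n\n" idea (my attempt so far; it assumes blank lines in Tika's raw output mark paragraph boundaries, which may not hold for every PDF):
import re

raw = pp.extract_contents()
paragraphs = []
# Split on runs of 2+ newlines instead of every newline, so the text
# between blank lines stays together as one paragraph.
for block in re.split(r"\n{2,}", raw):
    # Collapse the single newlines inside a paragraph into spaces
    block = re.sub(r"\s+", " ", block).strip()
    if len(block) > 5:
        paragraphs.append(block)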
All recommendations are greatly appreciated!
PS: If anyone already has a better solution, I'd be more than happy to take a look at it!

Related

What is the error in this code? I want to replace a set of characters in a text file when I give a word with blanks in it

I want to replace a set of characters when I give a word with blanks in it, using text from a file. For example:
I give the line "The Language Is _th_n !"
and it should return "python", replacing the "_" blanks using the words from a file like text.txt.
I wrote this code, please check it:
import re

with open('data/text', 'r', encoding='utf8') as file:
    word_list = file.read()

def solve(message):
    hint = []
    for i in range(15, len(message) - 1):  # skip the first 15 characters and the last one
        if message[i] != '\\':
            hint.append(message[i])
    hint_string = ''
    for i in hint:
        hint_string += i
    hint_replaced = hint_string.replace('_', '!')
    solution = re.findall('^' + hint_replaced + '$', word_list, re.MULTILINE)
    return solution
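The likely error (a hedged sketch of a fix, assuming the goal is that each blank matches exactly one character): '_' should be replaced with the regex wildcard '.', not with '!', which can never match a letter:
hint_replaced = hint_string.replace('_', '.')  # '.' matches any single character
solution = re.findall('^' + hint_replaced + '$', word_list, re.MULTILINE)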

python: extract parts from a line when using different delimiters

I am reading stdin line by line:
for line in sys.stdin:
    ...
Each line has the following format:
: 1631373881:0;echo
I need to extract the first number (the epoch time) and the command (the last part, after ';').
How can I extract these when the delimiters are not the same?
input_str = ": 1631373881:0;echo".split(";")
command = input_str[-1]  # everything after ';'
number = input_str[0].split(":")[1].replace(" ", "")  # the epoch time
If you know the lines always have the same format, you can bet on regular expressions:
import re

MASK = re.compile(r': (\d+):\d+;(.+)')

def extract(line):
    matches = MASK.findall(line)
    return matches[0] if matches else None

def test():
    assert extract(": 1631373881:0;echo test") == ("1631373881", "echo test")
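To wire this back into the original stdin loop (a sketch; it assumes lines that don't match the format should simply be skipped):
import sys

for line in sys.stdin:
    result = extract(line.rstrip('\n'))
    if result:
        epoch, command = result
        print(epoch, command)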

How to scrape data from PDF into Excel

I am trying to scrape data from a PDF and save it to an Excel file. This is the PDF I need: https://www.medicaljournals.se/acta/content_files/files/pdf/98/219/Suppl219.pdf
However, I don't need all the data, only the following, saved to Excel in separate cells:
From page 5, starting from P001 up to and including the Introduction - each entry has a P number, a title, people's names, and an Introduction.
For now, I can only convert the PDF file into text (my code below), which all ends up in one cell, but I need it separated into different cells:
import PyPDF2 as p2

PDFfile = open('Abstract Book from the 5th World Psoriasis and Psoriatic Arthritis Conference 2018.pdf', 'rb')
pdfread = p2.PdfFileReader(PDFfile)
pdflist = []
i = 6
while i < pdfread.getNumPages():
    pageinfo = pdfread.getPage(i)
    # print(pageinfo.extractText())
    i = i + 1
    pdflist.append(pageinfo.extractText().replace('\n', ''))
print(pdflist)
The main things you need are a 'header' regex (15 or more uppercase letters) and an 'article' regex (the letter 'P' plus 3 digits).
One more regex helps you divide the text by any of the keywords:
import re

import PyPDF2

article_re = re.compile(r'P\d{3}')  # 'P001': the letter 'P' and 3 digits
header_re = re.compile(r'[A-Z\s\-]{15,}|$')  # min 15 UPPERCASE letters, '\n' and '-' included
key_word_delimiters = ['Peoples', 'Introduction', 'Objectives', 'Methods',
                       'Results', 'Conclusions', 'References']

file = open('data.pdf', 'rb')
pdf = PyPDF2.PdfFileReader(file)

text = ''
for i in range(6, 63):
    text += pdf.getPage(i).extractText()  # all the text in one variable

articles = []
for article in re.split(article_re, text):
    header = re.match(header_re, article)  # receive a match
    other_text = re.split(header_re, article)[1]  # receive the other text
    if header:
        header = header.group()  # get the text from the match
        first_name_letter = header[-1]  # save the first letter of the name to put it back in the right position (hot bugfix)
        header = header[:-1]  # cut the last character: the first letter of the name
        header = header.replace('\n', '')  # delete line breaks
        header = header.replace('-', '')  # delete line-break hyphens
        item = {'header': header}
        other_text = first_name_letter + other_text
        data_array = re.split(
            'Introduction:|Objectives:|Methods:|Results:|Conclusions:|References:',
            other_text)
        for key, data in zip(key_word_delimiters, data_array):
            item[key] = data.replace('\n', '')
        articles.append(item)
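This builds the articles list but stops short of the Excel step the question asks for. A minimal sketch of that last step (my addition, assuming pandas plus an Excel writer such as openpyxl is installed): each dict becomes one row, and each key becomes a column, so every field lands in its own cell.
import pandas as pd

df = pd.DataFrame(articles)
df.to_excel('articles.xlsx', index=False)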

Print a line only if the regex match is surrounded by whitespace in Python

I have a file data.txt:
<tag,1>moon sunlightcream potato</tag>
<tag,2>dishes light jellybeans</tag>
and a python file match.py:
for LINE in open("data.txt"):
    STRING = "light"
    if STRING in LINE:
        print(LINE)
The output is:
<tag,1>moon sunlightcream potato</tag>
<tag,2>dishes light jellybeans</tag>
I want only:
dishes light jellybeans
How can I do that?
The larger context is:
TAG = QUERY.split("&")[1]
LIST = []
for LINE in open(DATA):
    STRING = "<tag,"
    if STRING in LINE:
        if TAG in LINE:
            print(LINE)
So it seems I can't simply search for " light " with surrounding spaces, because "light" is a variable.
The regex option was:
import re

def sub_list():
    TAG = "light"
    p_number = re.compile(r'<tag,.*,' + TAG + ',.*,>')
    for LINE in open(DATA):
        match = p_number.findall(LINE)
        if match:
            print(LINE)
But that doesn't help either.
But now it works with:
import re

TAG = "light"
for LINE in open(DATA):
    STRING = "<tag,"
    if STRING in LINE:
        if re.search(r'\b{}\b'.format(TAG), LINE):
            print(LINE)
You can use a regex as below. \b matches a word boundary: it matches only at the beginning or end of a word, so it won't match "light" when it is only a substring.
import re

LINES = ['moon sunlightcream potato', 'dishes light jellybeans']
match_tag = 'light'
for LINE in LINES:
    # you could also use re.search(r'\b' + match_tag + r'\b', LINE)
    if re.search(r'\b{}\b'.format(match_tag), LINE):
        print(LINE)  # only prints 'dishes light jellybeans'
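One caveat worth adding (my note, not part of the original answer): if the tag comes from user input, it may contain regex metacharacters, so re.escape keeps the pattern literal:
import re

TAG = "log.txt"  # hypothetical tag containing a regex metacharacter ('.')
line = "please read log.txt now"
if re.search(r'\b{}\b'.format(re.escape(TAG)), line):
    print(line)  # without re.escape, the '.' would match any character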

Excel List iteration

I am working on a text search project. I have 2 lists:
a = ['ibm', 'dell']
b = ['strength', 'keyword']  # a list of keywords given by the user
Now I create combinations for searching Google:
lst = list(itertools.product(a, b))
What I need help with is below:
Using this code I will search for the text using the different keywords and their lemmas. After that I need to write the found text to an Excel file: I need to create worksheets named after the entries in list a and write only the found text into the different worksheets. I am not able to figure this out. Below is part of my code.
def getarticle(url, n):
    final = []
    regex = '(.*).pdf'
    pattern = re.compile(regex)
    if re.match(pattern, url) is not None:
        text = pdf_to_text(url)
        final.append('')
        final.append(url)
        final.append(text)
        New_file = open('text' + str(round(random.random(), 2)) + '.txt', 'w+')
        New_file.write(smart_str(unicode(text, 'utf-8')))
        New_file.close()
    else:
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Chrome')]
        html = br.open(url).read()
        titles = br.title()
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        soup = bs4.BeautifulSoup(readable_article)
        Final_Article = soup.text
        final.append(titles)
        final.append(url)
        final.append(Final_Article)
        raw = nltk.clean_html(html)
        cleaned = re.sub(r'& ?(ld|rd)quo ?[;\]]', '\"', raw)
        tokens = nltk.wordpunct_tokenize(raw)
        lmtzr = WordNetLemmatizer()
        t = [lmtzr.lemmatize(t) for t in tokens]
        text = nltk.Text(t)
        word = words(n)
        find = ' '.join(str(e) for e in word)
        search_words = set(find.split(' '))
        sents = ' '.join([s.lower() for s in text])
        blob = TextBlob(sents.decode('ascii', 'ignore'))
        matches = [map(str, blob.sentences[i-1:i+2])  # from the previous to after the next
                   for i, s in enumerate(blob.sentences)  # i is the index, s is the sentence
                   if search_words & set(s.words)]
        return ''.join(str(y).replace('& rdquo', '').replace('& rsquo', '') for y in matches)
This returns the text; now I need to write it to Excel files, which I am unable to code.
As far as writing text out to a file Excel can read is concerned, you might want to look at Python's csv library, which provides lots of useful .csv manipulation tools.
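A minimal sketch of that suggestion (my addition; the results dict is a hypothetical stand-in for what the search returns, and since the csv format has no worksheets it writes one .csv file per name in list a):
import csv

# Hypothetical results: {company: [(keyword, found_text), ...]}
results = {'ibm': [('strength', '...')], 'dell': [('keyword', '...')]}

for name, rows in results.items():
    # One file per company; Excel opens each .csv as a single sheet.
    with open(name + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['keyword', 'text'])
        writer.writerows(rows)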
