Extract paragraphs instead of sentences with Python Tika

I've been trying different approaches to solve my issue, but nothing seems to be working - I need your help!
I found code that is quite effective at parsing a PDF (e.g. an annual report) and extracting sentences from it. I was, however, wondering how I could extract paragraphs from it instead of sentences.
My intuition would be to split on the double newline, i.e. "\n\n", rather than removing it, since it could indicate the start of a new paragraph. I haven't quite managed to get it to work, though. Does anyone have any tips?
# PDF parsing
import re
import string

import nltk
from tika import parser


class parsePDF:
    def __init__(self, url):
        self.url = url

    def extract_contents(self):
        """ Extract a pdf's contents using tika. """
        pdf = parser.from_file(self.url)
        self.text = pdf["content"]
        return self.text

    def clean_text(self):
        """ Extract & clean sentences from the raw text of the pdf. """
        # Remove non-ASCII characters
        printables = set(string.printable)
        self.text = "".join(filter(lambda x: x in printables, self.text))
        # Replace tabs with spaces
        self.text = re.sub(r"\t+", r" ", self.text)
        # Aggregate lines where the sentence wraps;
        # also, a line in CAPITALS is counted as a header
        fragments = []
        prev = ""
        for line in re.split(r"\n+", self.text):
            if line.isupper():
                prev = "."  # skip it
            elif line and (line.startswith(" ") or line[0].islower()
                           or not prev.endswith(".")):
                prev = f"{prev} {line}"  # make into one line
            else:
                fragments.append(prev)
                prev = line
        fragments.append(prev)
        # Clean the lines into sentences
        sentences = []
        for line in fragments:
            # Use regular expressions to clean the text
            url_str = (r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:#\-_=#]+\."
                       r"([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:#\-_=#])*")
            line = re.sub(url_str, r" ", line)  # URLs
            line = re.sub(r"^\s?\d+(.*)$", r"\1", line)  # headers
            line = re.sub(r"\d{5,}", r" ", line)  # figures
            line = re.sub(r"\.+", ".", line)  # multiple periods
            line = line.strip()  # leading & trailing spaces
            line = re.sub(r"\s+", " ", line)  # multiple spaces
            line = re.sub(r"\s?([,:;\.])", r"\1", line)  # punctuation spaces
            line = re.sub(r"\s?-\s?", "-", line)  # split-line words
            # Use nltk to split the line into sentences
            for sentence in nltk.sent_tokenize(line):
                s = str(sentence).strip().lower()  # lower case
                # Exclude tables of contents and short sentences
                if "table of contents" not in s and len(s) > 5:
                    sentences.append(s)
        return sentences
And this is how you can call it afterwards:
url = "https://www.aperam.com/sites/default/files/documents/Annual_Report_2021.pdf"  # any PDF URL you want
pp = parsePDF(url)
pp.extract_contents()
sentences = pp.clean_text()
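Here is a rough sketch of the "\n\n" idea (my attempt so far; it assumes blank lines in Tika's raw output mark paragraph boundaries, which may not hold for every PDF):
import re

raw = pp.extract_contents()
paragraphs = []
# Split on runs of 2+ newlines instead of every newline, so the text
# between blank lines stays together as one paragraph.
for block in re.split(r"\n{2,}", raw):
    # Collapse the single newlines inside a paragraph into spaces
    block = re.sub(r"\s+", " ", block).strip()
    if len(block) > 5:
        paragraphs.append(block)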
All recommendations are greatly appreciated!
PS: If anyone already has a better solution, I'd be more than happy to take a look at it!

Related

What is the error in this code? I want to replace a set of characters in a text file when I give a word with blanks in it

I want to replace a set of characters when I give a word with blanks in it, using text from a file. For example:
I give the line "The Language Is _th_n !"
and it should return "python", replacing the "_" blanks using the words from a file like text.txt.
I wrote this code, please check it:
import re

with open('data/text', 'r', encoding='utf8') as file:
    word_list = file.read()

def solve(message):
    hint = []
    for i in range(15, len(message) - 1):  # skip the first 15 characters and the last one
        if message[i] != '\\':
            hint.append(message[i])
    hint_string = ''
    for i in hint:
        hint_string += i
    hint_replaced = hint_string.replace('_', '!')
    solution = re.findall('^' + hint_replaced + '$', word_list, re.MULTILINE)
    return solution
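The likely error (a hedged sketch of a fix, assuming the goal is that each blank matches exactly one character): '_' should be replaced with the regex wildcard '.', not with '!', which can never match a letter:
hint_replaced = hint_string.replace('_', '.')  # '.' matches any single character
solution = re.findall('^' + hint_replaced + '$', word_list, re.MULTILINE)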

python: extract parts from a line when using different delimiters

I am reading stdin line by line:
for line in sys.stdin:
    ...
Each line has the following format:
: 1631373881:0;echo
I need to extract the first number (the epoch time) and the command (the last part, after ';').
How can I extract these when the delimiters are not the same?
input_str = ": 1631373881:0;echo".split(";")
command = input_str[-1]  # everything after ';'
number = input_str[0].split(":")[1].replace(" ", "")  # the epoch time
If you know the lines always have the same format, you can bet on regular expressions:
import re

MASK = re.compile(r': (\d+):\d+;(.+)')

def extract(line):
    matches = MASK.findall(line)
    return matches[0] if matches else None

def test():
    assert extract(": 1631373881:0;echo test") == ("1631373881", "echo test")
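To wire this back into the original stdin loop (a sketch; it assumes lines that don't match the format should simply be skipped):
import sys

for line in sys.stdin:
    result = extract(line.rstrip('\n'))
    if result:
        epoch, command = result
        print(epoch, command)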

How to scrape data from PDF into Excel

I am trying to scrape data from a PDF and save it to an Excel file. This is the PDF I need: https://www.medicaljournals.se/acta/content_files/files/pdf/98/219/Suppl219.pdf
However, I don't need all the data, only the following, saved to Excel in separate cells:
From page 5, starting from P001 up to and including the Introduction - each entry has a P number, a title, people's names, and an Introduction.
For now, I can only convert the PDF file into text (my code below), which all ends up in one cell, but I need it separated into different cells:
import PyPDF2 as p2

PDFfile = open('Abstract Book from the 5th World Psoriasis and Psoriatic Arthritis Conference 2018.pdf', 'rb')
pdfread = p2.PdfFileReader(PDFfile)
pdflist = []
i = 6
while i < pdfread.getNumPages():
    pageinfo = pdfread.getPage(i)
    # print(pageinfo.extractText())
    i = i + 1
    pdflist.append(pageinfo.extractText().replace('\n', ''))
print(pdflist)
The main things you need are a 'header' regex (15 or more uppercase letters) and an 'article' regex (the letter 'P' plus 3 digits).
One more regex helps you divide the text by any of the keywords:
import re

import PyPDF2

article_re = re.compile(r'P\d{3}')  # 'P001': the letter 'P' and 3 digits
header_re = re.compile(r'[A-Z\s\-]{15,}|$')  # min 15 UPPERCASE letters, '\n' and '-' included
key_word_delimiters = ['Peoples', 'Introduction', 'Objectives', 'Methods',
                       'Results', 'Conclusions', 'References']

file = open('data.pdf', 'rb')
pdf = PyPDF2.PdfFileReader(file)

text = ''
for i in range(6, 63):
    text += pdf.getPage(i).extractText()  # all the text in one variable

articles = []
for article in re.split(article_re, text):
    header = re.match(header_re, article)  # receive a match
    other_text = re.split(header_re, article)[1]  # receive the other text
    if header:
        header = header.group()  # get the text from the match
        first_name_letter = header[-1]  # save the first letter of the name to put it back in the right position (hot bugfix)
        header = header[:-1]  # cut the last character: the first letter of the name
        header = header.replace('\n', '')  # delete line breaks
        header = header.replace('-', '')  # delete line-break hyphens
        item = {'header': header}
        other_text = first_name_letter + other_text
        data_array = re.split(
            'Introduction:|Objectives:|Methods:|Results:|Conclusions:|References:',
            other_text)
        for key, data in zip(key_word_delimiters, data_array):
            item[key] = data.replace('\n', '')
        articles.append(item)
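This builds the articles list but stops short of the Excel step the question asks for. A minimal sketch of that last step (my addition, assuming pandas plus an Excel writer such as openpyxl is installed): each dict becomes one row, and each key becomes a column, so every field lands in its own cell.
import pandas as pd

df = pd.DataFrame(articles)
df.to_excel('articles.xlsx', index=False)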

Print a line only if the regex match is surrounded by whitespace in Python

I have a file data.txt:
<tag,1>moon sunlightcream potato</tag>
<tag,2>dishes light jellybeans</tag>
and a python file match.py:
for LINE in open("data.txt"):
    STRING = "light"
    if STRING in LINE:
        print(LINE)
The output is:
<tag,1>moon sunlightcream potato</tag>
<tag,2>dishes light jellybeans</tag>
I want only:
dishes light jellybeans
How can I do that?
The larger context is:
TAG = QUERY.split("&")[1]
LIST = []
for LINE in open(DATA):
    STRING = "<tag,"
    if STRING in LINE:
        if TAG in LINE:
            print(LINE)
So it seems I can't simply search for " light " with surrounding spaces, because "light" is a variable.
The regex option was:
import re

def sub_list():
    TAG = "light"
    p_number = re.compile(r'<tag,.*,' + TAG + ',.*,>')
    for LINE in open(DATA):
        match = p_number.findall(LINE)
        if match:
            print(LINE)
But that doesn't help either.
But now it works with:
import re

TAG = "light"
for LINE in open(DATA):
    STRING = "<tag,"
    if STRING in LINE:
        if re.search(r'\b{}\b'.format(TAG), LINE):
            print(LINE)
You can use a regex as below. \b matches a word boundary: it matches only at the beginning or end of a word, so it won't match "light" when it is only a substring.
import re

LINES = ['moon sunlightcream potato', 'dishes light jellybeans']
match_tag = 'light'
for LINE in LINES:
    # you could also use re.search(r'\b' + match_tag + r'\b', LINE)
    if re.search(r'\b{}\b'.format(match_tag), LINE):
        print(LINE)  # only prints 'dishes light jellybeans'
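One caveat worth adding (my note, not part of the original answer): if the tag comes from user input, it may contain regex metacharacters, so re.escape keeps the pattern literal:
import re

TAG = "log.txt"  # hypothetical tag containing a regex metacharacter ('.')
line = "please read log.txt now"
if re.search(r'\b{}\b'.format(re.escape(TAG)), line):
    print(line)  # without re.escape, the '.' would match any character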

Excel List iteration

I am working on a text search project. I have 2 lists:
a = ['ibm', 'dell']
b = ['strength', 'keyword']  # a list of keywords given by the user
Now I create combinations for searching Google:
lst = list(itertools.product(a, b))
What I need help with is below:
Using this code I will search for the text using the different keywords and their lemmas. After that I need to write the found text to an Excel file: I need to create worksheets named after the entries in list a and write only the found text into the different worksheets. I am not able to figure this out. Below is part of my code.
def getarticle(url, n):
    final = []
    regex = '(.*).pdf'
    pattern = re.compile(regex)
    if re.match(pattern, url) is not None:
        text = pdf_to_text(url)
        final.append('')
        final.append(url)
        final.append(text)
        New_file = open('text' + str(round(random.random(), 2)) + '.txt', 'w+')
        New_file.write(smart_str(unicode(text, 'utf-8')))
        New_file.close()
    else:
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Chrome')]
        html = br.open(url).read()
        titles = br.title()
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        soup = bs4.BeautifulSoup(readable_article)
        Final_Article = soup.text
        final.append(titles)
        final.append(url)
        final.append(Final_Article)
        raw = nltk.clean_html(html)
        cleaned = re.sub(r'& ?(ld|rd)quo ?[;\]]', '\"', raw)
        tokens = nltk.wordpunct_tokenize(raw)
        lmtzr = WordNetLemmatizer()
        t = [lmtzr.lemmatize(t) for t in tokens]
        text = nltk.Text(t)
        word = words(n)
        find = ' '.join(str(e) for e in word)
        search_words = set(find.split(' '))
        sents = ' '.join([s.lower() for s in text])
        blob = TextBlob(sents.decode('ascii', 'ignore'))
        matches = [map(str, blob.sentences[i-1:i+2])  # from the previous to after the next
                   for i, s in enumerate(blob.sentences)  # i is the index, s is the sentence
                   if search_words & set(s.words)]
        return ''.join(str(y).replace('& rdquo', '').replace('& rsquo', '') for y in matches)
This returns the text; now I need to write it to Excel files, which I am unable to code.
As far as writing text out to a file Excel can read is concerned, you might want to look at Python's csv library, which provides lots of useful .csv manipulation tools.
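A minimal sketch of that suggestion (my addition; the results dict is a hypothetical stand-in for what the search returns, and since the csv format has no worksheets it writes one .csv file per name in list a):
import csv

# Hypothetical results: {company: [(keyword, found_text), ...]}
results = {'ibm': [('strength', '...')], 'dell': [('keyword', '...')]}

for name, rows in results.items():
    # One file per company; Excel opens each .csv as a single sheet.
    with open(name + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['keyword', 'text'])
        writer.writerows(rows)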
