I'm trying to get a python script that allows me to automatically retrieve an abstract from any PDF, as long as it contains one.
Does anyone have an idea how to write an automated Python script, requiring only the input file, that would extract the abstract from that PDF?
Here is what I have already obtained. This script is not automated since it requires a specific word to delimit the text to extract...
# Read every page of the PDF and collect the extracted text in `pagecontent`.
# The `with` block closes the file even if PyPDF2 raises while parsing.
# NOTE(review): PyPDF2 must be imported before this snippet runs; the import
# is not part of the snippet as posted.
with open('3D Printing in Pharmaceutical Sector: An Overview.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    # Build the text with a single join instead of growing a string page by page.
    pagecontent = "".join(
        pdfReader.getPage(count).extractText() for count in range(num_pages)
    )
def between(value, a, b):
    """Return the part of ``value`` between the first ``a`` and the last ``b``.

    An empty string is returned when either marker is missing, or when the
    last ``b`` does not start after the end of the first ``a``.
    """
    start = value.find(a)
    end = value.rfind(b)
    if start == -1 or end == -1:
        return ""
    # Skip over the opening marker itself.
    start += len(a)
    return value[start:end] if start < end else ""
# Text between the "Abstract" and "Keywords" headings of the document.
desired = between(pagecontent, "Abstract", "Keywords")
print('The abstract of the document is :' + desired)

# Drop every non-ASCII character, lowercase the rest, and return to str.
ascii_bytes = desired.encode('ascii', 'ignore')
text = ascii_bytes.lower().decode('ISO-8859-1')
# A keyword is a letter followed by at least one more word character.
keywords = re.findall(r'[a-zA-Z]\w+', text)
Related
I would like to get the radio-button / checkbox information from a PDF document.
I had a look at pdfplumber and PyPDF2, but was not able to find a solution with these modules.
I can parse the text using this code, but for the radio buttons I get only the text and no information about which button (or checkbox) is selected.
import pdfplumber
import os
import sys
if __name__ == '__main__':
    # input.pdf is looked up in the directory that contains the script.
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    fn = os.path.join(path, "input.pdf")
    # Use the PDF as a context manager so it is closed when we are done.
    with pdfplumber.open(fn) as pdf:
        page = pdf.pages[0]
        text = page.extract_text()
I have also uploaded an example file here:
https://easyupload.io/8y8k2v
Is there any way to get this information from the pdf-file using python?
I think I found a solution using pdfplumber
(probably not elegant, but I can check whether the radio buttons are selected or not).
Generally:
I read all chars and all curves for all pages,
then I sort all elements by x and y (to get the chars and elements in the correct order, as in the PDF),
then I concatenate the chars and also add blanks when the distance between two chars is larger than within a word.
I check the pts information of the curves to find out whether the radio button is selected or not.
The final lines and the yes/no information are stored in a list, line by line, for further processing.
import pdfplumber
import os
import sys
# `path` was not defined in this snippet; input.pdf is expected next to the
# running script.
path = os.path.abspath(os.path.dirname(sys.argv[0]))
fn = os.path.join(path, "input.pdf")

# Text runs and radio-button markers of all pages, in reading order.
finalContent = []
with pdfplumber.open(fn) as pdf:
    for idx, page in enumerate(pdf.pages, start=1):
        print(f"Reading page {idx}")

        # One row per page element: [kind, payload, x0, y0].
        contList = [["char", e["text"], e["x0"], e["y0"]] for e in page.chars]
        contList.extend(
            ["curve", e["pts"], e["x0"], e["y0"]] for e in page.curves
        )
        # Order by y0 descending, then x0 ascending (same result as the
        # original two stable sorts).
        contList.sort(key=lambda row: (-row[3], row[2]))

        workContent = []
        workText = ""
        # None means "no character seen yet". The original compared against
        # False, which also treated a character at x0 == 0 as "not seen".
        prev_x = prev_y = None
        for kind, payload, x0, y0 in contList:
            if kind == "char":
                # A large horizontal gap or a drop to a lower line starts a
                # new field; mark the break with " / ".
                if prev_x is not None and (x0 - prev_x > 20 or y0 - prev_y < -2):
                    workText += " / "
                workText += payload
                prev_x, prev_y = x0, y0
                continue
            if kind == "curve":
                # Flush the text collected so far before recording the curve.
                if workText != "":
                    workContent.append(workText)
                    workText = ""
                # NOTE(review): the threshold 100 on the first point's first
                # coordinate is taken as-is from the sample file; confirm it
                # for other PDFs.
                tmpVal = "SELECT-YES" if payload[0][0] < 100 else "SELECT-NO"
                workContent.append(f"CURVE {tmpVal}, None, None")
        # Keep text that follows the last curve on the page; it used to be
        # silently dropped.
        if workText != "":
            workContent.append(workText)

        finalContent.extend(workContent)
        # Page content as one newline-separated string (not used further here).
        workContent = "\n".join(workContent)
I am trying to scrape the data from PDF and get it saved into an excel file. This is the pdf I needed: https://www.medicaljournals.se/acta/content_files/files/pdf/98/219/Suppl219.pdf
However, I need to scrape not all the data but the following one (below), and then saved it to excel in different cells:
From page 5, starting from P001 to and including Introduction - there is a P number, title, people names, and Introduction.
For now, I can only convert the PDF file into text (my code is below) and save it all in one cell, but I need it to be separated into different cells.
import PyPDF2 as p2
# The file name has to be one string literal; the line break inside the
# original literal was a syntax error. It is assumed to have been a wrapped
# space -- adjust if the real file name differs.
with open('Abstract Book from the 5th World Psoriasis and Psoriatic Arthritis '
          'Conference 2018.pdf', 'rb') as PDFfile:
    pdfread = p2.PdfFileReader(PDFfile)
    # Start at page index 6 and flatten each page's text onto a single line.
    pdflist = [
        pdfread.getPage(i).extractText().replace('\n', '')
        for i in range(6, pdfread.getNumPages())
    ]
print(pdflist)
The main things you need are a 'header' regex (at least 15 UPPERCASE letters) and an 'article' regex (the letter 'P' followed by 3 digits).
One more regex helps you split the text at any of the section keywords.
import re

import PyPDF2

# P001: letter 'P' and 3 digits
article_re = re.compile(r'[P]\d{3}')
# A run of at least 15 uppercase letters, whitespace or '-' characters;
# the `|$` branch additionally matches the empty string at the end.
header_re = re.compile(r'[A-Z\s\-]{15,}|$')
key_word_delimeters = ['Peoples', 'Introduction','Objectives','Methods','Results','Conclusions','References']
section_re = re.compile(
    'Introduction:|Objectives:|Methods:|Results:|Conclusions:|References:')

# Read pages 6..62 into one string; the file is closed once the text is read.
with open('data.pdf', 'rb') as file:
    pdf = PyPDF2.PdfFileReader(file)
    text = ''.join(pdf.getPage(i).extractText() for i in range(6, 63))

articles = []
for article in article_re.split(text):
    match = header_re.match(article)
    # Skip chunks that do not start with a header. An empty chunk matches the
    # `|$` branch with an empty string, which used to crash on header[-1].
    if match is None or not match.group():
        continue
    raw_header = match.group()
    # Text between this header and the next header-like run (or the end).
    other_text = header_re.split(article)[1]
    # The header pattern also swallows the first (uppercase) letter of the
    # first name; move that letter back in front of the remaining text.
    first_name_letter = raw_header[-1]
    # Remove line breaks and hyphens, then store the cleaned header (the
    # original stored the uncleaned one).
    header = raw_header[:-1].replace('\n', '').replace('-', '')
    item = {'header': header}
    other_text = first_name_letter + other_text
    data_array = section_re.split(other_text)
    # NOTE(review): pairing is positional, so it assumes the sections appear
    # in the listed order and none is missing.
    for key, data in zip(key_word_delimeters, data_array):
        item[key] = data.replace('\n', '')
    articles.append(item)
How to call a function inside a for loop in Python
I must call this function
def EE():
    """Print the text ``dd``; there is no return value."""
    print("dd")
    return None
inside this
def upload_file(request):
    """Save every uploaded file and call ``EE()`` once per saved file.

    On a POST request, each file sent under ``file_field`` is stored through
    ``FileSystemStorage`` and ``EE()`` is called right after each save. The
    upload page is rendered for every request method.
    """
    if request.method == 'POST':
        # Constructed as in the original; it is not validated here.
        form = UploadFileForm(request.POST, request.FILES)
        files = request.FILES.getlist('file_field')
        fs = FileSystemStorage()
        for f in files:
            fs.save(f.name, f)
            # The call asked about: it runs once for each uploaded file.
            ee = EE()
            print(ee)
        # The original also computed fs.url(filename) here. That raised
        # UnboundLocalError when no file was posted, and the value was never
        # used because the context entry below is commented out.
    return render(request, 'core/simple_upload.html', {
        # 'uploaded_file_url': uploaded_file_url
    })
The way you have written it is correct. However, since your function doesn't return any value, I doubt you will get the desired output.
This assumes the function to be called and the calling function are in the same scope.
def sample_function():
    """Return a fixed demonstration message."""
    message = "This is a sample function."
    return message
def main_function():
    """Call ``sample_function`` and print the value it returns."""
    # function call
    print(sample_function())
    # add your logic here.
Hope this will help.
def sentence_finder(text, word):
    """Return the sentences of ``text`` whose token list contains ``word``.

    ``text`` is split with ``sent_tokenize``; a sentence is kept when ``word``
    is one of the items returned by ``word_tokenize`` for that sentence.
    """
    matches = []
    for sentence in sent_tokenize(text):
        if word in word_tokenize(sentence):
            matches.append(sentence)
    return matches
def _lemma_names(word):
    """Return the lemma names of every WordNet synset found for ``word``."""
    return list(chain.from_iterable(
        synset.lemma_names() for synset in wordnet.synsets(word)))


def _read_pdf_text(filename):
    """Return the text of ``filename``, using OCR when PyPDF2 finds none."""
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        text = "".join(
            pdfReader.getPage(n).extractText()
            for n in range(pdfReader.numPages))
    if text != "":
        return text
    # PyPDF2 returned nothing (e.g. a scanned file), so run OCR via textract.
    text = textract.process(filename, method='tesseract', language='eng')
    # The regex below uses a str pattern, so bytes output must be decoded.
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='ignore')
    return text


def _last_match(text, terms):
    """Return the sentences for the last term in ``terms`` that matches.

    Returns ``None`` when no term matches any sentence.
    """
    found = None
    for term in terms:
        sentences = sentence_finder(text, term)
        if sentences:
            found = sentences
    return found


def EE(filename, no_of_files):
    """Print the degree and university sentences found in a CV's education section.

    The education section is the text between an ``EDUCATION`` line and the
    next ``SKILLS``. The same ``filename`` is processed ``no_of_files`` times,
    each run preceded by its 1-based number. When the section or a matching
    sentence is missing, "No Education Qualification mentioned" is printed.
    Any other failure while processing is reported with its real cause
    instead of that message. Nothing is returned.
    """
    # The search terms do not depend on the file, so build them once.
    degree_terms = (['bsc', 'be', 'btech'] + _lemma_names('degree')
                    + _lemma_names('BACHELOR') + _lemma_names('Master')
                    + ['m.s.'])
    university_terms = (['center'] + _lemma_names('university')
                        + _lemma_names('institute') + _lemma_names('college'))

    for i in range(no_of_files):
        print('\n')
        print(i + 1)
        try:
            text = _read_pdf_text(filename)
            # select relevant section: education qualification
            section = re.search(r'EDUCATION\n.*?SKILLS', text, re.DOTALL)
            if section is None:
                print("No Education Qualification mentioned")
                continue
            textt = section.group()
            # Drop the EDUCATION heading line and the trailing SKILLS line.
            edu_qulification = textt[textt.find('\n') + 1:textt.rfind('\n')]
            # One sentence per line, with "m.s." spelled out as "master".
            str2 = (edu_qulification.lower()
                    .replace("\n", ". ")
                    .replace("m.s.", "master"))

            Digree = _last_match(str2, degree_terms)
            if Digree is None:
                print("No Education Qualification mentioned")
                continue
            University = _last_match(str2, university_terms)
            print(Digree)
            if University is None:
                print("No Education Qualification mentioned")
                continue
            print(University)
            print(".................................................................")
        except Exception as exc:
            # Stay best-effort like the original, but show the real problem
            # (unreadable file, OCR failure, ...) rather than a misleading
            # "not mentioned" message.
            print(f"Could not process {filename}: {exc}")
I'm working on a small content-analysis program that I was hoping to run over several PDF files, returning how often some specific words are mentioned in the text. The words that are searched for are specified in a separate text file (list.txt) and can be altered. The program runs just fine on files in .txt format, but the result is completely different when running the program on a .pdf file. To illustrate, the test text that I have the program running through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words grouped in categories are the following (marked in .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following:
Whereas the ouput from running it with a .pdf is the following:
As you can see, my issue pertains to the splitting of the words: in the .pdf output, a string like "world" can be split into 'w','o','rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to a source where I can find out why this happens, should you know one.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2
# Load the word list once; the file is closed as soon as it has been read.
with open('list.txt') as dictfile:
    lines = dictfile.readlines()

# import the dictionary: a line starting with ">>" opens a new category,
# every other non-empty line is a regular expression of the current category.
dic = {}
# a default category for simple word lists
current_category = "Default"
categories = [current_category]
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        categories.append(current_category)
    else:
        line = line.strip()
        if len(line) > 0:
            dic[re.compile(line, re.IGNORECASE)] = current_category

for i in range(2011, 2012):
    f = 'annual_report_' + str(i) + '.txt'
    with open(f) as textfile:
        # Whitespace-separated tokens. The text is not lowercased; case is
        # handled by re.IGNORECASE on the patterns.
        text = textfile.read().split()
    print(text)

    # Fresh counters for this file, in the order the categories were declared.
    scores = dict.fromkeys(categories, 0)

    # examine the text: a token counts when a pattern matches at its start
    for token in text:
        for pattern, categ in dic.items():
            if pattern.match(token):
                scores[categ] += 1

    print(os.path.basename(f))
    for key, value in scores.items():
        print(key, ":", value)
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2
# Load the word list once; the file is closed as soon as it has been read.
with open('list.txt') as dictfile:
    lines = dictfile.readlines()

# import the dictionary: a line starting with ">>" opens a new category,
# every other non-empty line is a regular expression of the current category.
dic = {}
# a default category for simple word lists
current_category = "Default"
categories = [current_category]
for line in lines:
    if line[0:2] == '>>':
        current_category = line[2:].strip()
        categories.append(current_category)
    else:
        line = line.strip()
        if len(line) > 0:
            dic[re.compile(line, re.IGNORECASE)] = current_category

for i in range(2011, 2012):
    f = 'annual_report_' + str(i) + '.pdf'
    # Collect the tokens of every page. The original overwrote the token list
    # on each page, so only the last page was scored.
    tokens = []
    with open(f, 'rb') as pdffile:
        reader = PyPDF2.PdfFileReader(pdffile)
        for pageNum in range(reader.numPages):
            page_tokens = reader.getPage(pageNum).extractText().split()
            print(page_tokens)
            tokens.extend(page_tokens)

    # Fresh counters for this file, in the order the categories were declared.
    scores = dict.fromkeys(categories, 0)

    # examine the text: a token counts when a pattern matches at its start
    for token in tokens:
        for pattern, categ in dic.items():
            if pattern.match(token):
                scores[categ] += 1

    print(os.path.basename(f))
    for key, value in scores.items():
        print(key, ":", value)
I am (attempting) to write a program that searches through a hex file for instances of a hex string between two values, eg. Between D4135B and D414AC, incrementing between the first value until the second is reached- D4135B, D4135C, D4135D etc etc.
I have managed to get it to increment etc, but it’s the search part I am having trouble with.
This is the code I have so far, it's been cobbled together from other places and I need to make it somehow output all search hits into the output file (file_out)
I have exceeded the limit of my Python understanding and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
# NOTE(review): this function is not valid Python as posted and must be
# rewritten before it can run:
#   * `If`, `Else`, `Print` and `Return` are capitalised; Python spells them
#     `if`, `else`, `print` and `return`.
#   * The string on the `Print` line uses typographic quotes instead of
#     ASCII quotes.
#   * `Flag` is tested, but the name declared global and assigned is `FLAG`.
#   * `threeByteHexPlusOne` and `threeByteHex2` are not defined in this
#     snippet and are never reassigned inside the `while` loop, so nothing
#     here changes the loop condition.
#   * Indentation was lost, so it is unclear which statements belong to the
#     loop and where `Return -1` is meant to sit.
def search_process(hx): # searching for two binary strings
global FLAG
while threeByteHexPlusOne != threeByteHex2: #Keep incrementing until second value reached
If Flag:
if hx.find(threeByteHex2) != -1:
FLAG = False #If threeByteHex = ThreeByteHexPlusOne, end search
Print (“Reached the end of the search”,hx.find(threeByteHexPlusOne))
Else:
If hx.find(threeByteHexPlusOne) != -1:
FLAG = True
Return -1 #If no results found
# NOTE(review): driver for search_process; it cannot run as posted:
#   * `file_in.read` is written without parentheses (twice), so `hx_read` is
#     bound to the method itself rather than to data read from the file.
#   * `tmp = bytes_read[]` is a syntax error, and `bytes_read` is not defined
#     anywhere in this snippet.
#   * The slice is stored in `hex_read`, but every later statement uses
#     `hx_read`, so the inner `while pos != -1` loop calls search_process
#     again with the same, unchanged argument.
#   * `FILE_IN` and `FILE_OUT` are not defined in this snippet.
#   * Neither file is closed.
if __name__ == '__main__':
try:
file_in = open(FILE_IN, "r") #opening input file
file_out = open(FILE_OUT, 'w') #opening output file
hx_read = file_in.read #read from input file
tmp = ''
found = ''
while hx_read: #reading from file till file is empty
hx_read = tmp + hx_read
pos = search_process(hx_read)
while pos != -1:
hex_read = hx_read[pos:]
if FLAG:
found = found + hx_read
pos = search_process(hx_read)
tmp = bytes_read[]
hx_read = file_in.read
file_out.write(found) #writing to output file
except IOError:
print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file 3 bytes at a time and if the 3-byte hex string is between the given hex bounds, it writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys
# Usage text printed when the script is not started with exactly two arguments.
_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])
def _to_base_10_int(value):
return int(value, 16)
def get_bytes(hex_str):
    """Yield ``hex_str`` in consecutive two-character chunks.

    Two hex characters make one byte. When the length is odd, the final
    chunk holds a single character.
    """
    offset = 0
    total = len(hex_str)
    while offset < total:
        yield hex_str[offset:offset + 2]
        offset += 2
def get_three_byte_hexes(hex_str):
    """Yield the bytes of ``hex_str`` joined in non-overlapping groups of three.

    The bytes come from ``get_bytes``. A trailing group with fewer than three
    items is dropped.
    """
    byte_iter = get_bytes(hex_str)
    # zip pulls three items per step from the one shared iterator and stops
    # as soon as a full group can no longer be formed.
    for first, second, third in zip(byte_iter, byte_iter, byte_iter):
        yield first + second + third
def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
    """Return the three-byte groups of ``hex_str`` whose value lies in range.

    A group is kept when ``lower_bound_hex <= group < upper_bound_hex``,
    compared as integers: the lower bound is inclusive, the upper bound
    exclusive. Groups are returned in the order they occur.
    """
    low = _to_base_10_int(lower_bound_hex)
    high = _to_base_10_int(upper_bound_hex)
    return [
        chunk
        for chunk in get_three_byte_hexes(hex_str)
        if low <= _to_base_10_int(chunk) < high
    ]
if __name__ == "__main__":
    # Check the arguments explicitly: assert statements are skipped when
    # Python runs with -O, and `print x` is Python 2 only.
    if len(sys.argv) != 3:
        print(_usage_string)
        sys.exit(2)

    with open(sys.argv[1], 'rb') as fin:
        file_contents = fin.read()
    # The input is base64 text: decode it, then render the bytes as hex.
    # decodebytes()/bytes.hex() are the Python 3 replacements for
    # decodestring()/.encode('hex').
    hex_str = base64.decodebytes(file_contents).hex()

    found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
    print('Found:')
    print(found)
    if found:
        # The hits are str objects, so the output file is opened in text
        # mode. They are written back to back, as before.
        with open(sys.argv[2], 'w') as fout:
            fout.writelines(found)
Check out some more info on generators here