Extracting Data from Multiple PDFs' - python

I am trying to extract data from PDF document and have regarding that - I was able to get the code working for one single PDF. However, is there a way I can point the code to a folder with multiple PDF's and get the extract out in CSV? I am a complete beginner in Python, so any help will be appreciated. Below is the current code that I have.
import pdfplumber
import pandas as pd
file = 'Test Slip.pdf'
lines = []
with pdfplumber.open(file) as pdf:
pages = pdf.pages
for page in pdf.pages:
text = page.extract_text()
for line in text.split('\n'):
lines.append(line)
print(line)
df = pd.DataFrame(lines)
df.to_csv('test.csv')

One possible option would be to use os.listdir and only read files that end in .pdf:
import os
folder_with_pdfs = '/path/to/folder'
for pdf_file in os.listdir(folder_with_pdfs):
if pdf_file.endswith('.pdf'):
pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
# do pdf reading with opening pdf_file_path

I am not sure why you aim to write lines to a dataframe as rows but this should be what you need:
import pdfplumber
import pandas as pd
import os
def extract_pdf(pdf_path):
linesOfFile = []
with pdfplumber.open(pdf_path) as pdf:
for pdf_page in pdf.pages:
single_page_text = pdf_page.extract_text()
for linesOfFile in single_page_text.split('\n'):
linesOfFile.append(line)
#print(linesOfFile)
return linesOfFile
folder_with_pdfs = 'folder_path'
linesOfFiles = []
for pdf_file in os.listdir(folder_with_pdfs):
if pdf_file.endswith('.pdf'):
pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
linesOfFile = extract_pdf(pdf_file_path)
linesOfFiles.append(linesOfFile)
df = pd.DataFrame(linesOfFiles)
df.to_csv('test.csv')

Related

How to download a csv file in Python

I am trying to download a csv file from the url
https://qubeshub.org/publications/1220/supportingdocs/1#supportingdocs .
the file is Elephant Morphometrics and Tusk Size-originaldata-3861.csv
I have tried using using pd.read_csv()
and
import pandas as pd
import io
import requests
url="https://qubeshub.org/publications/1220/supportingdocs/1#supportingdocs/Elephant Morphometrics and Tusk Size-originaldata-3861.csv"
s=requests.get(url).content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))
Try:
import requests
url = "https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1"
r = requests.get(url)
filename = r.headers["Content-Disposition"].split('"')[1]
with open(filename, "wb") as f_out:
print(f"Downloading {filename}")
f_out.write(r.content)
Prints:
Downloading Elephant Morphometrics and Tusk Size-originaldata-3861.csv
and saves the file.
This should download the file and parse the rows and columns into a csv file
import requests
import csv
url = "https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1"
req=requests.get(url)
rows = req.content.decode('utf-8').split("\r\n")
rows.pop()
csv_local_filename = "test.csv"
with open(csv_local_filename, 'w') as fs:
writer = csv.writer(fs, delimiter = ',')
for row in rows:
entries = row.split(',')
b=writer.writerow(entries)
You'll likely want to convert those columns into the desired types before you start working with them. The example code above leaves everything as a string.
After I run the above code I see:
>tail test.csv
2005-13,88,m,32.5,290,162.3,40
2005-13,51,m,37.5,270,113.2,40
2005-13,86,m,37.5,310,175.3,38
and
>head test.csv
Years of sample collection,Elephant ID,Sex,Estimated Age (years),shoulder Height in cm,Tusk Length in cm,Tusk Circumference in cm
1966-68,12,f,0.08,102,,
1966-68,34,f,0.08,89,,
1966-68,162,f,0.083,89,,
1966-68,292,f,0.083,92,,
In Firefox after downloading file in browser you can check link to this file and it shows
https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1
and this link you should use in code
import pandas as pd
df = pd.read_csv('https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1')
print(df)

'Nonetype object is not itreable' when trying to extract from PDF

I am trying to extract data from a PDF, but I keep getting a type error because my object is not iterable (on the statement for line in text: but I don't understand why 'text' has no value, just above that I create the text object using text = page.extract.text() and then I want to iterate through each line of the text to find matches to my regexes.
I'm afraid that my statement for line in text: is the problem; perhaps using 'line' isn't appropriate, but I don't know what else to do.
My code is below, thanks for looking!
import requests
import pdfplumber
import pandas as pd
import re
from collections import namedtuple
Line = namedtuple('Line', 'gbloc_name contact_type email')
gbloc_re = re.compile(r'^(?:a\.\s[A-Z]{5}\:\s[A-Z]{4})')
line_re = re.compile(r'^[^#\s]+#[^#\s]\.[^#\s]+$')
file = 'sampleReport.pdf'
lines=[]
with pdfplumber.open(file) as pdf:
pages = pdf.pages
for page in pdf.pages:
text = page.extract_text()
for line in text:
gbloc = gbloc_re.search(line)
if gbloc:
gbloc_name = gbloc
elif line.startswith('Outbound'):
contact_type = 'Outbound'
elif line.startswith('Tracing'):
contact_type = 'Tracing'
elif line.startswith('Customer'):
contact_type = 'Customer Service'
elif line.startswith('QA'):
contact_type = 'Quality Assurance'
elif line.startswith('NTS'):
contact_type = 'NTS'
elif line.startswith('Inbound'):
contact_type = 'Inbound'
elif line_re.search(line):
items = line.split()
lines.append(Line(gbloc_name, contact_type, *items))
Try setting the loop directly equal to the page.extract_text() value. Like this:
with pdfplumber.open(file) as pdf:
for page in pdf.pages:
for line in page.extract_text():
I used lib PyPDF2 to extract text from PDF. Here, i made a simple source code.
It will extract the content by page.
import PyPDF2
with open('example.pdf', 'rb') as pdfFileObj:
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print(pdfReader.numPages)
for i in range(0, pdfReader.numPages):
print("Page: ", i)
pageObj = pdfReader.getPage(i)
print(pageObj.extractText())
Image result:
Please check and respond to me if you have any issue.

I have converted a pdf file to csv using anaconda python3 But the converted csv file is not in a readable form how to make it readable?

# importing required modules
import PyPDF2
# creating a pdf file object
pdfFileObj = open(path, 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
# creating a page object
pageObj = pdfReader.getPage(0)
# extracting text from page
print(pageObj.extractText())
df = pd.DataFrame(pdfFileObj)
print (df)
df.to_csv('output.csv')
I have converted a pdf file to csv using anaconda python 3. But the converted csv file is not in a readable form. how to make that csv in readable format?
I tested your method and I couldn't find a way to correct the csv ouput. I useally do it this way:
import csv
import os
from miner_text_generator import extract_text_by_page
def export_as_csv(pdf_path, csv_path):
filename = os.path.splitext(os.path.basename(pdf_path))[0]
counter = 1
with open(csv_path, 'w') as csv_file:
writer = csv.writer(csv_file)
for page in extract_text_by_page(pdf_path):
text = page[0:100]
words = text.split()
writer.writerow(words)
if __name__ == '__main__':
pdf_path = '<your path to the file>.pdf'
csv_path = '<path to the output>.csv'
export_as_csv(pdf_path, csv_path)

In Python what is the best way to read a pdf table with no outline?

I am trying to read data from a table in a pdf into a pandas dataframe. I am able to do so using tabula-py when the pdf has outlines around the table, but when I try on the pdf without an outline the script produces an error.
For example, I am looking at the pdfs available from two different urls. I have downloaded the pdfs from the urls and saved them as 'JSE Opts.pdf' and 'JSE Divs.pdf' respectively.
import requests
import pandas as pd
url='https://clientportal.jse.co.za/JSE%20Equity%20Derivatives/Dividends/ED_DividendsReport.pdf'
response = requests.get(url)
fname = 'JSE Divs.pdf'
f= open(fname, 'wb')
f.write(response.content)
f.close()
url='https://clientportal.jse.co.za/JSE%20Equity%20Derivatives/Options%20Daily%20Traded%20Report/ED_OptionsDailyTradedReport.pdf'
response = requests.get(url)
fname = 'JSE Opts.pdf'
f= open(fname, 'wb')
f.write(response.content)
f.close()
I am able to read the 'JSE Opts.pdf' into a pandas dataframe using the code:
import tabula as tb
pdf = './JSE Opts.pdf'
data = tb.read_pdf(pdf,pages = 1)
data = data[0]
print(data)
When I try to do the same for 'JSE Divs.pdf', I get errors and tabula-py is only able to read the header:
pdf = './JSE Divs.pdf'
data = tb.read_pdf(pdf,pages = 1)
data = data[0]
print(data)
I suspect that this is because there are no lines around the table. If that is the case, what is the best way to go about reading the data from 'JSE Divs.pdf' into pandas?
I was able to read the data into a string using pdfplumber, save the string as a CSV file (after cleaning the data to suit my needs) and then import into pandas.
import pdfplumber
pdf = pdfplumber.open("./JSE Divs.pdf")
text = ''
i = 0
while True:
try:
text += pdf.pages[i].extract_text() + '\n'
i = i+1
except IndexError:
break
for replace_s in [' DN',' CA1',' ANY',' CSH',' PHY',' QUANTO']:
text = text.replace(replace_s,'')
while True:
try:
idx = text.index('EXO')
replace_s =text[idx-1:idx+8]
text = text.replace(replace_s,'')
except ValueError:
break
cols ='EXPIRY_s,USYM,EXPIRY,EX_DATE,CUM_PV_DIVS,CUM_DIVS,ISIN,INSTR_ID\n'
text = text[text.index('Div\n')+4:]
text = cols + text
text = text.replace(' ',',')
f = open('divs.csv','w')
f.write(text)
f.close()

How to write csv file in html?

I have read file of csv but I have a problem that how to read CSV file and save it in table.html?
import csv
html_about = ''
names = []
with open('filo.csv') as data_file:
csv_data = csv.reader(data_file)
for line in csv_data:
names.append(f'{line[0]}')
html_output = '\n<ul>'
for name in names:
html_output += f'\n\t<li>{name}</li>'
html_output += '\n</ul>'
from prettytable import PrettyTable
x = PrettyTable(line[0])
html_code = x.get_html_string()
html_file = open('table.html','w')
html_file = html_file.write(html_code)
I suggest you use pandas library,
it has pd.read_csv, and also pd.to_html
usage should look like this, let me know if this works for you:
import pandas as pd
df = pd.read_csv('filo.csv')
with open('table.html', 'w') as html_file:
df.to_html(html_file)

Categories