I need a little help figuring out why this isn't working as expected.
the following code, opens a PDF file, extracts the text and should save the individual text files according to the name of the PDF file, however its not producing any output. please help. code is as follows:
import PyPDF2
import os
import glob
directory = 'C:/LIVE/2017/'
fileStructure = glob.glob("C:/LIVE/2017/*")
names = [os.path.basename(x) for x in glob.glob('C:/LIVE/2017/*')]
for file in os.listdir(directory):
with open(os.path.join(directory,file), 'rb') as pdfFileObj:
pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
pageObj = pdfReader.getPage(0)
number_of_pages = pdfReader.getNumPages()
for page_number in range(number_of_pages):
page = pdfReader.getPage(page_number)
page_content = page.extractText().encode('utf-8')
getFileName = os.path.basename(pdfFileObj.name)
bcn = getFileName.rsplit(' ', 1)[-1]
bcNum = os.path.splitext(os.path.basename(bcn))[0]
text_file = open(bcNum, "w")
text_file.write(page_content)
text_file.close()
Does it extract strings from the PDF to begin with? I've tried using PyPDF2 before and noticed it often has trouble getting text from PDFs if they're not formatted exactly right. I've had much more success using the module Tika.
from tika import parser
def read_pdf(pdf):
raw = parser.from_file(pdf)
return raw['content']
text_list = list()
for file in os.listdir(directory):
raw_content = read_pdf(pdf)
text_list.append(raw_content)
Related
I am new to python, only one script behind me for searching strings in pdfs. Now, I would like to build script which will give me results in new CSV/xlsx file where I will have first lines and their page numbers of given pdf file. For now I have code below for printing whole page:
from PyPDF2 import PdfFileReader
pdf_document = "example.pdf"
with open(pdf_document, "rb") as filehandle:
pdf = PdfFileReader(filehandle)
info = pdf.getDocumentInfo()
pages = pdf.getNumPages()
print (info)
print ("number of pages: %i" % pages)
page1 = pdf.getPage(0)
print(page1)
print(page1.extractText())
You can read pdf file page by page, split by '\n' (if that is the character that splits lines), then use the CSV package to write into a CSV file. A script like below. Just to mention that it if the PDF contains images this code will not be able to extract text. You need an OCR module to convert images to text first.
from PyPDF2 import PdfFileReader
import csv
pdf_document = "test.pdf"
with open(pdf_document, "rb") as filehandle:
pdf = PdfFileReader(filehandle)
with open('result.csv','w') as csv_file:
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['page numebr','firts line'])
for i in range(0, pdf.getNumPages()):
content= pdf.getPage(i).extractText().split('\n')
print(content[0]) # prints first line
print(i+1) # prints page number
print('-------------')
csv_writer.writerow([i+1,content[0]])
I have to convert whole pdf to text. i have seen at many places converting pdf to text but particular page.
from PyPDF2 import PdfFileReader
import os
def text_extractor(path):
with open(os.path.join(path,file), 'rb') as f:
pdf = PdfFileReader(f)
###Here i can specify page but i need to convert whole pdf without specifying pages###
page = pdf.getPage(0)
text = page.extractText()
print(text)
if __name__ == '__main__':
path="C:\\Users\\AAAA\\Desktop\\BB"
for file in os.listdir(path):
if not file.endswith(".pdf"):
continue
text_extractor(path)
How to convert whole pdf file to text without using getpage()??
You may want to use textract as this answer recommends to get the full document if all you want is the text.
If you want to use PyPDF2 then you can first get the number of pages then iterate over each page such as:
from PyPDF2 import PdfFileReader
import os
def text_extractor(path):
with open(os.path.join(path,file), 'rb') as f:
pdf = PdfFileReader(f)
###Here i can specify page but i need to convert whole pdf without specifying pages###
text = ""
for page_num in range(pdf.getNumPages()):
page = pdf.getPage(page_num)
text += page.extractText()
print(text)
if __name__ == '__main__':
path="C:\\Users\\AAAA\\Desktop\\BB"
for file in os.listdir(path):
if not file.endswith(".pdf"):
continue
text_extractor(path)
Though you may want to remember which page the text came from in which case you could use a list:
page_text = []
for page_num in range(pdf.getNumPages()): # For each page
page = pdf.getPage(page_num) # Get that page's reference
page_text.append(page.extractText()) # Add that page to our array
for page in page_text:
print(page) # print each page
You could use tika to accomplish this task, but the output needs a little cleaning.
from tika import parser
parse_entire_pdf = parser.from_file('mypdf.pdf', xmlContent=True)
parse_entire_pdf = parse_entire_pdf['content']
print (parse_entire_pdf)
This answer uses PyPDF2 and encode('utf-8') to keep the output per page together.
from PyPDF2 import PdfFileReader
def pdf_text_extractor(path):
with open(path, 'rb') as f:
pdf = PdfFileReader(f)
# Get total pdf page number.
totalPageNumber = pdf.numPages
currentPageNumber = 0
while (currentPageNumber < totalPageNumber):
page = pdf.getPage(currentPageNumber)
text = page.extractText()
# The encoding put each page on a single line.
# type is <class 'bytes'>
print(text.encode('utf-8'))
#################################
# This outputs the text to a list,
# but it doesn't keep paragraphs
# together
#################################
# output = text.encode('utf-8')
# split = str(output, 'utf-8').split('\n')
# print (split)
#################################
# Process next page.
currentPageNumber += 1
path = 'mypdf.pdf'
pdf_text_extractor(path)
Try pdfreader. You can extract either plain text or decoded text containing "pdf markdown":
from pdfreader import SimplePDFViewer, PageDoesNotExist
fd = open(you_pdf_file_name, "rb")
viewer = SimplePDFViewer(fd)
plain_text = ""
pdf_markdown = ""
try:
while True:
viewer.render()
pdf_markdown += viewer.canvas.text_content
plain_text += "".join(viewer.canvas.strings)
viewer.next()
except PageDoesNotExist:
pass
PDF is a page-oriented format & therefore you'll need to deal with the concept of pages.
What makes it perhaps even more difficult, you're not guaranteed that the text excerpts you're able to extract are extracted in the same order as they are presented on the page: PDF allows one to say "put this text within a 4x3 box situated 1" from the top, with a 1" left margin.", and then I can put the next set of text somewhere else on the same page.
Your extractText() function simply gets the extracted text blocks in document order, not presentation order.
Tables are notoriously difficult to extract in a common, meaningful way... You see them as tables, PDF sees them as text blocks placed on the page with little or no relationship.
Still, getPage() and extractText() are good starting points & if you have simply formatted pages, they may work fine.
I found out a very simple way to do this.
You have to follow this steps:
Install PyPDF2 :To do this step if you use Anaconda, search for Anaconda Prompt and digit the following command, you need administrator permission to do this.
pip install PyPDF2
If you're not using Anaconda you have to install pip and put its path
to your cmd or terminal.
Python Code: This following code shows how to convert a pdf file very easily:
import PyPDF2
with open("pdf file path here",'rb') as file_obj:
pdf_reader = PyPDF2.PdfFileReader(file_obj)
raw = pdf_reader.getPage(0).extractText()
print(raw)
I just used pdftotext module to get this done easily.
import pdftotext
# Load your PDF
with open("test.pdf", "rb") as f:
pdf = pdftotext.PDF(f)
# creating a text file after iterating through all pages in the pdf
file = open("test.txt", "w")
for page in pdf:
file.write(page)
file.close()
Link: https://github.com/manojitballav/pdf-text
I am wondering if there is a way in python (tool or function etc.) to convert my pdf file to doc or docx?
I am aware of online converters but I need this in Python code.
If you have pdf with lot of pages..below code will work:
import PyPDF2
path="C:\\ .... "
text=""
pdf_file = open(path, 'rb')
text =""
read_pdf = PyPDF2.PdfFileReader(pdf_file)
c = read_pdf.numPages
for i in range(c):
page = read_pdf.getPage(i)
text+=(page.extractText())
If you happen to have MS Word, there is a really simple way to do this using COM.
Here is a script I wrote that can convert pdf to docx by calling the Word application.
import glob
import win32com.client
import os
word = win32com.client.Dispatch("Word.Application")
word.visible = 0
pdfs_path = "" # folder where the .pdf files are stored
for i, doc in enumerate(glob.iglob(pdfs_path+"*.pdf")):
print(doc)
filename = doc.split('\\')[-1]
in_file = os.path.abspath(doc)
print(in_file)
wb = word.Documents.Open(in_file)
out_file = os.path.abspath(reqs_path +filename[0:-4]+ ".docx".format(i))
print("outfile\n",out_file)
wb.SaveAs2(out_file, FileFormat=16) # file format for docx
print("success...")
wb.Close()
word.Quit()
I used the following code to read the pdf file, but it does not read it. What could possibly be the reason?
from PyPDF2 import PdfFileReader
reader = PdfFileReader("example.pdf")
contents = reader.pages[0].extractText().split("\n")
print(contents)
The output is [u''] instead of reading the content.
import re
from PyPDF2 import PdfFileReader
reader = PdfFileReader("example.pdf")
for page in reader.pages:
text = page.extractText()
text_lower = text.lower()
for line in text_lower:
if re.search("abc", line):
print(line)
I use it to iterate page by page of pdf and search for key terms in it and process further.
May be this can help you to read PDF.
import pyPdf
def getPDFContent(path):
content = ""
pages = 10
p = file(path, "rb")
pdf_content = pyPdf.PdfFileReader(p)
for i in range(0, pages):
content += pdf_content.getPage(i).extractText() + "\n"
content = " ".join(content.replace(u"\xa0", " ").strip().split())
return content
I think you need to specify the disc name, it's missing in your directory. For example "D:/Users/Rahul/Desktop/Dfiles/106_2015_34-76357.pdf". I tried and I can read without any problem.
Or if you want to find the file path using the os module which you didn't really associate with your directory, you can try the following:
from PyPDF2 import PdfFileReader
import os
def find(name, path):
for root, dirs, files in os.walk(path):
if name in files:
return os.path.join(root, name)
directory = find('106_2015_34-76357.pdf', 'D:/Users/Rahul/Desktop/Dfiles/')
f = open(directory, 'rb')
reader = PdfFileReader(f)
contents = reader.getPage(0).extractText().split('\n')
f.close()
print(contents)
The find function can be found in Nadia Alramli's answer here Find a file in python
To Read the files from Multiple Folders in a directory, below code can be used-
This Example is for reading pdf files:
import os
from tika import parser
path = "/usr/local/" # path directory
directory=os.path.join(path)
for r,d,f in os.walk(directory): #going through subdirectories
for file in f:
if ".pdf" in file: # reading only PDF files
file_join = os.path.join(r, file) #getting full path
file_data = parser.from_file(file_join) # parsing the PDF file
text = file_data['content'] # read the content
print(text) #print the content
def getTextPDF(pdfFileName,password=''):
import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
from nltk import sent_tokenize
""" Extract Text from pdf """
pdf_file=open(pdfFileName,'rb')
read_pdf=PyPDF2.PdfFileReader(pdf_file)
if password !='':
read_pdf.decrypt(password)
text=[]
for i in range(0,read_pdf.getNumPages()):
text.append(read_pdf.getPage(i).extractText())
text = '\n'.join (text).replace("\n",'')
text = sent_tokenize(text)
return text
The issue was one of two things: (1) The text was not on page one - hence a user error. (2) PyPDF2 failed to extract the text - hence a bug in PyPDF2.
Sadly, the second one still happens for some PDFs.
Hello Rahul Pipalia,
If not install PyPDF2 in your python so first install PyPDF2 after use this module.
Installation Steps for Ubuntu (Install python-pypdf)
First, open terminal
After type sudo apt-get install python-pypdf
Your Probelm Solution
Try this below code,
# Import Library
import PyPDF2
# Which you want to read file so give file name with ".pdf" extension
pdf_file = open('Your_Pdf_File_Name.pdf')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()
#Give page number of the pdf file (How many page in pdf file).
# #param Page_Nuber_of_the_PDF_file: Give page number here i.e 1
page = read_pdf.getPage(Page_Nuber_of_the_PDF_file)
page_content = page.extractText()
# Display content of the pdf
print page_content
Download the PDF from below link and try this code,
https://www.dropbox.com/s/4qad66r2361hvmu/sample.pdf?dl=1
I hope my answer is helpful.
If any query so comments, please.
I have a long list of .json files that I need to download to my computer. I want to download them as .json files (so no parsing or anything like that at this point).
I have some code that works for small files, but it is pretty buggy. Also it doesn't handle multiple links well.
Appreciate any advice to fix up this code:
import os
filename = 'test.json'
path = "C:/Users//Master"
fullpath = os.path.join(path, filename)
import urllib2
url = 'https://www.premierlife.com/secure/index.json'
response = urllib2.urlopen(url)
webContent = response.read()
f = open(fullpath, 'w')
f.write(webContent)
f.close
It's creating a blank file because the f.close at the end should be f.close().
I took your code and made a little function and then called it on a little loop to go through a .txt file with the list of urls called "list_of_urls.txt" having 1 url per line (you can change the delimiter in the split function if you want to format it differently).
def save_json(url):
import os
filename = url.replace('/','').replace(':','')
# this replaces / and : in urls
path = "C:/Users/Master"
fullpath = os.path.join(path, filename)
import urllib2
response = urllib2.urlopen(url)
webContent = response.read()
f = open(fullpath, 'w')
f.write(webContent)
f.close()
And then the loop:
f = open('list_of_urls.txt')
p = f.read()
url_list = p.split('\n') #here's where \n is the line break delimiter that can be changed
for url in url_list:
save_json(url)