from PyPDF2 import PdfFileReader, PdfFileWriter
import os as os
# os.listdir returns the bare entry names of the folder (no directory part).
listdir = os.listdir(r"C:\Users\Max12\Desktop\xml\pdfminer\UiPath\attachments\75090058\Status\Verwerking")
for file in listdir:
    if file.endswith(".pdf"):
        pdf_file_path = 'Unknown.pdf'  # assigned here but never read again in this snippet
        file_base_name = file.replace('.pdf', '')
        # NOTE(review): `file` is a bare name, so this resolves against the current
        # working directory rather than the folder listed above - confirm the cwd.
        pdf = PdfFileReader(file)
        # Fixed indices: asks for a second page whether or not the document has one.
        pages = [0, 1] # page 1, 2
        pdfWriter = PdfFileWriter()
    else:
        pass
# NOTE(review): indentation was lost in the paste. The statements below are presumed
# to sit at top level, i.e. they run once after the loop and only see the last
# values of pdf / pages / pdfWriter / file_base_name.
for page_num in pages:
    pdfWriter.addPage(pdf.getPage(page_num))
with open('{0}_subset.pdf'.format(file_base_name), 'wb') as f:
    pdfWriter.write(f)
    f.close()
Hi all,
I want to trim the PDF files in a directory to at most two pages each: any file that has two pages or more should be cut down to its first two pages. I've written the code above.
However, my IDE is giving the following error:
Traceback (most recent call last):
File "file.py", line 16, in <module>
pdfWriter.addPage(pdf.getPage(page_num))
File "C:\Python38\lib\site-packages\PyPDF2\pdf.py", line 1177, in getPage
return self.flattenedPages[pageNumber]
IndexError: list index out of range
I don't know what I'm doing wrong.. Can any of you guys help me?
The code will fail if a pdf file consists of only one page. Since pdf.getNumPages() returns the number of pages in the file you can replace pages = [0, 1] with pages = range(min(2, pdf.getNumPages())) to fix this.
Additionally, you iterate over pdf files in the directory, but then you process only the last file which is not what you want to accomplish. The second for loop and the with statement should be inside the if block.
Overall the following should work:
from PyPDF2 import PdfFileReader, PdfFileWriter
import os

# Folder whose PDFs are copied into "<name>_subset.pdf" files of at most MAX_PAGES pages.
SOURCE_DIR = r"C:\Users\Max12\Desktop\xml\pdfminer\UiPath\attachments\75090058\Status\Verwerking"
MAX_PAGES = 2

listdir = os.listdir(SOURCE_DIR)
for file in listdir:
    if not file.endswith(".pdf"):
        continue
    file_base_name = os.path.splitext(file)[0]
    # os.listdir yields bare names, so join with the folder; otherwise the file is
    # looked up in the current working directory.
    with open(os.path.join(SOURCE_DIR, file), 'rb') as source:
        pdf = PdfFileReader(source)
        pdfWriter = PdfFileWriter()
        # min() keeps one-page documents from raising IndexError in getPage.
        for page_num in range(min(MAX_PAGES, pdf.getNumPages())):
            pdfWriter.addPage(pdf.getPage(page_num))
        # Written while the source is still open, since the pages come from that stream.
        # The output goes to the current working directory, as in the original snippet.
        with open('{0}_subset.pdf'.format(file_base_name), 'wb') as f:
            pdfWriter.write(f)
Related
I am new to python, only one script behind me for searching strings in pdfs. Now, I would like to build script which will give me results in new CSV/xlsx file where I will have first lines and their page numbers of given pdf file. For now I have code below for printing whole page:
from PyPDF2 import PdfFileReader

pdf_document = "example.pdf"

with open(pdf_document, "rb") as stream:
    reader = PdfFileReader(stream)

    # Document-level facts are fetched first, then reported: metadata and page count.
    metadata, page_count = reader.getDocumentInfo(), reader.getNumPages()
    print(metadata)
    print("number of pages: %i" % page_count)

    # Then the first page: the page object itself, followed by its extracted text.
    first_page = reader.getPage(0)
    print(first_page)
    print(first_page.extractText())
You can read the PDF file page by page, split each page's text on '\n' (if that is the character that separates lines), and then use the csv module to write the result to a CSV file, as in the script below. Note that if the PDF contains images, this code will not be able to extract text from them; you would need an OCR module to convert the images to text first.
from PyPDF2 import PdfFileReader
import csv

pdf_document = "test.pdf"

# Writes one CSV row per page: the 1-based page number and that page's first text line.
with open(pdf_document, "rb") as filehandle:
    pdf = PdfFileReader(filehandle)
    # newline='' is what the csv module asks for; without it, rows can be written
    # with an extra blank line between them on Windows.
    with open('result.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['page number', 'first line'])
        for page_number in range(1, pdf.getNumPages() + 1):
            # str.split always returns at least one item, so [0] is safe even
            # when a page yields no text.
            first_line = pdf.getPage(page_number - 1).extractText().split('\n')[0]
            print(first_line)  # prints first line
            print(page_number)  # prints page number
            print('-------------')
            csv_writer.writerow([page_number, first_line])
I need a little help figuring out why this isn't working as expected.
The following code opens each PDF file, extracts the text, and should save it to an individual text file named after the PDF file; however, it's not producing any output. Please help. The code is as follows:
import PyPDF2
import os
import glob
directory = 'C:/LIVE/2017/'
# fileStructure and names are built here but not used anywhere else in this snippet.
fileStructure = glob.glob("C:/LIVE/2017/*")
names = [os.path.basename(x) for x in glob.glob('C:/LIVE/2017/*')]
for file in os.listdir(directory):
    with open(os.path.join(directory,file), 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
        pageObj = pdfReader.getPage(0)  # not read again below
        number_of_pages = pdfReader.getNumPages()
        for page_number in range(number_of_pages):
            page = pdfReader.getPage(page_number)
            # encode() turns the extracted text into a bytes object.
            page_content = page.extractText().encode('utf-8')
        # NOTE(review): indentation was lost in the paste; the lines below are presumed
        # to follow the page loop. Either way only one page's text survives, because
        # page_content is overwritten per page and the output is opened with "w".
        getFileName = os.path.basename(pdfFileObj.name)
        # Keep whatever follows the last space in the file name, then drop the extension.
        bcn = getFileName.rsplit(' ', 1)[-1]
        bcNum = os.path.splitext(os.path.basename(bcn))[0]
        # Relative name with no extension: the file is created in the current working
        # directory, not in `directory`.
        # NOTE(review): on Python 3, writing the bytes held in page_content to a
        # text-mode file raises TypeError - confirm which interpreter is in use.
        text_file = open(bcNum, "w")
        text_file.write(page_content)
        text_file.close()
Does it extract strings from the PDF to begin with? I've tried using PyPDF2 before and noticed it often has trouble getting text from PDFs if they're not formatted exactly right. I've had much more success using the module Tika.
import os

from tika import parser

# Same folder as in the question's script.
directory = 'C:/LIVE/2017/'


def read_pdf(pdf):
    """Return the text content Tika extracts from the file at path *pdf*.

    NOTE(review): presumably this is None when Tika finds no text - confirm
    before joining or writing the results.
    """
    raw = parser.from_file(pdf)
    return raw['content']


# os.listdir yields bare names, so each one is joined with the folder before parsing.
text_list = [
    read_pdf(os.path.join(directory, file))
    for file in os.listdir(directory)
]
I'm not very experienced in programming. What I'm trying to do is to randomly shuffle the pages of a pdf and output it to another pdf.
Searching online I found the following two solutions (source 1, source 2):
#!/usr/bin/env python2
# Reads a PDF from standard input and writes a page-shuffled copy to standard output.
import random, sys
from PyPDF2 import PdfFileWriter, PdfFileReader
# NOTE(review): the reader seeks within its stream (see stream.seek in the traceback),
# so stdin presumably has to be a redirected file rather than a terminal - confirm.
input = PdfFileReader(sys.stdin)
output = PdfFileWriter()
# Python 2: range() returns a list, which random.shuffle reorders in place.
pages = range(input.getNumPages())
random.shuffle(pages)
for i in pages:
    output.addPage(input.getPage(i))
output.write(sys.stdout)
And this one:
#!/usr/bin/python
# Python 2 script (uses the file() builtin and print statements).
# The PDF to shuffle is taken from sys.argv[1]; the result is written to
# "<that path>-mixed.pdf".
import sys
import random
from pyPdf import PdfFileWriter, PdfFileReader
# read input pdf and instantiate output pdf
output = PdfFileWriter()
input1 = PdfFileReader(file(sys.argv[1],"rb"))
# construct and shuffle page number list
pages = list(range(input1.getNumPages()))
random.shuffle(pages)
# display new sequence
print 'Reordering pages according to sequence:'
print pages
# add the new sequence of pages to output pdf
for page in pages:
    output.addPage(input1.getPage(page))
# write the output pdf to file
outputStream = file(sys.argv[1]+'-mixed.pdf','wb')
output.write(outputStream)
outputStream.close()
I tried both (and both with PyPDF2 and pyPdf) and both indeed create a new pdf file, but this file is simply empty (and has 0KB) (when I enter, let's say "shuffle.py new.pdf").
I'm using PyCharm and one problem I encounter (and not really understand) is that it says: "Cannot find reference 'PdfFileWriter'".
PyCharm tells me that it cannot find the reference
I would appreciate any help understanding what I'm doing wrong :)
EDIT:
As suggested by Tom Dalton, I'm posting what happens when I run the first one:
C:\Users\Anwender\AppData\Local\Temp\shuffle.py\venv\Scripts\python.exe "E:/Shuffle PDF/shuffle.py"
PdfReadWarning: PdfFileReader stream/file object is not in binary mode. It may not be read correctly. [pdf.py:1079]
Traceback (most recent call last):
File "E:/Shuffle PDF/shuffle.py", line 5, in <module>
input = PdfFileReader(sys.stdin)
File "C:\Python27\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Python27\lib\site-packages\PyPDF2\pdf.py", line 1689, in read
stream.seek(-1, 2)
IOError: [Errno 22] Invalid argument
Process finished with exit code 1
From the comments I infer that the fact that a new PDF is created is only due to me typing "shuffle.py newfile.pdf" into the terminal :D
EDIT 2: I now figured it out; this now works:
from PyPDF2 import PdfFileReader, PdfFileWriter
import random, sys

# Writes the pages of test.pdf to output2.pdf in random order.
# open() replaces the Python-2-only file() builtin, and the reader is no longer
# named `input`, which shadowed the builtin of that name.
with open("test.pdf", "rb") as source:
    reader = PdfFileReader(source)
    output = PdfFileWriter()
    # list(...) because random.shuffle needs a mutable sequence; on Python 3 a bare
    # range() is not one.
    pages = list(range(reader.getNumPages()))
    random.shuffle(pages)
    for i in pages:
        output.addPage(reader.getPage(i))
    # Written while the source is still open, since the pages come from that stream.
    with open(r"output2.pdf", "wb") as outputStream:
        output.write(outputStream)
Hello Stackoverflow community!
I'm trying to build a Python program that will walk a directory (and all sub-directories) and keep an accumulated word count across all .html, .txt, and .pdf files. Reading a .pdf file requires a little something extra (PdfFileReader) to parse the file. When parsing a .pdf file I get the following error and the program stops:
AttributeError: 'PdfFileReader' object has no attribute 'startswith'
When not parsing .pdf files, the program completes successfully.
CODE
#!/usr/bin/python
# Python 2 script (print statements). Walks the path given as sys.argv[1] and prints
# the 50 most common words found in the files it handles.
import re
import os
import sys
import os.path
import fnmatch
import collections
from PyPDF2 import PdfFileReader
# Placeholder from the original post, not valid Python: the real word list was elided.
ignore = [<lots of words>]
def extract(file_path, counter):
    # Reads the file at file_path as text, lower-cases it, and counts every \w+ token
    # that is longer than two characters and not in `ignore`.
    words = re.findall('\w+', open(file_path).read().lower())
    counter.update([x for x in words if x not in ignore and len(x) > 2])
def search(path):
    print path
    counter = collections.Counter()
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith(('.html', '.txt')):
                    print file
                    extract(os.path.join(root, file), counter)
                if file.lower().endswith(('.pdf')):
                    file_path = os.path.abspath(os.path.join(root, file))
                    print file_path
                    with open(file_path, 'rb') as f:
                        reader = PdfFileReader(f)
                        # `reader` is a PdfFileReader object, not a path component;
                        # this os.path.join call is where the reported AttributeError
                        # is raised.
                        extract(os.path.join(root, reader), counter)
                        contents = reader.getPage(0).extractText().split('\n')
                        # `contents` is a list of lines, again not a path component.
                        extract(os.path.join(root, contents), counter)
                        pass
    else:
        # A non-directory path is treated as a single text file.
        extract(path, counter)
    print(counter.most_common(50))
search(sys.argv[1])
The full error
Traceback (most recent call last):File line 50, in <module> search(sys.argv[1])
File line 36, in search extract(os.path.join(root, reader), counter)
File line 68, in join if b.startswith('/'):
AttributeError: 'PdfFileReader' object has no attribute 'startswith'
It appears there is a failure when calling the extract function with the .pdf file. Any help/guidance would be greatly appreciated!
Expected Results (works w/out .pdf files)
[('cyber', 5101), ('2016', 5095), ('date', 4912), ('threat', 4343)]
The problem is that this line
reader = PdfFileReader(f)
returns an object of type PdfFileReader. You're then passing this object to the extract() function which is expecting a file path and not a PdfFileReader object.
My suggestion would be to move the PDF-related processing that you currently have in the search() function into the extract() function instead. Then, in extract(), you would check whether the path points to a PDF file and act accordingly. So, something like this:
def extract(file_path, counter):
    """Add the words found in the file at *file_path* to *counter*.

    PDF files are read with PdfFileReader and the text of every page is used;
    .html and .txt files are read as plain text. Any other file type is skipped.
    In both handled cases the text is lower-cased, split into \\w+ tokens, and only
    tokens longer than two characters that are not in ``ignore`` are counted.

    Args:
        file_path: Path of the file to read (a string, not a reader object).
        counter: A collections.Counter that is updated in place.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        # Open the path passed in, in binary mode, and keep the handle open while
        # the pages are being extracted.
        with open(file_path, 'rb') as pdf_file:
            reader = PdfFileReader(pdf_file)
            text = '\n'.join(
                reader.getPage(page_number).extractText()
                for page_number in range(reader.getNumPages())
            )
    elif lowered_path.endswith(('.html', '.txt')):
        with open(file_path) as text_file:
            text = text_file.read()
    else:
        # Some other file type: nothing to count.
        return
    # Same tokenisation for every file type, so PDFs contribute words, not whole lines.
    words = re.findall(r'\w+', text.lower())
    counter.update([x for x in words if x not in ignore and len(x) > 2])
Haven't tested the code snippet above but hopefully you should get the idea.
I used the following code to read the pdf file, but it does not read it. What could possibly be the reason?
from PyPDF2 import PdfFileReader
# The reader is created from a path string; "example.pdf" is resolved against the
# current working directory.
reader = PdfFileReader("example.pdf")
# Text of the first page only, split into a list at each newline.
contents = reader.pages[0].extractText().split("\n")
print(contents)
The output is [u''] instead of reading the content.
import re
from PyPDF2 import PdfFileReader

# Compiled once. Page text is lower-cased before matching, so keep the pattern lower-case.
KEYWORD = re.compile("abc")

reader = PdfFileReader("example.pdf")
for page in reader.pages:
    text_lower = page.extractText().lower()
    # Iterate over lines, not over the string itself: looping over a str yields one
    # character at a time, so a multi-character pattern could never match.
    for line in text_lower.splitlines():
        if KEYWORD.search(line):
            print(line)
I use it to iterate page by page of pdf and search for key terms in it and process further.
Maybe this can help you read the PDF.
import pyPdf
def getPDFContent(path, pages=10):
    """Return the text of the first *pages* pages of the PDF at *path*.

    The per-page text is joined, non-breaking spaces (U+00A0) are replaced by plain
    spaces, and every run of whitespace is collapsed to a single space.

    Args:
        path: Path of the PDF file to read.
        pages: Maximum number of pages to read. Documents with fewer pages are
            read in full instead of raising IndexError.
    """
    # open() instead of the Python-2-only file() builtin; the with block closes the
    # handle once the pages have been extracted.
    with open(path, "rb") as p:
        pdf_content = pyPdf.PdfFileReader(p)
        page_count = min(pages, pdf_content.getNumPages())
        page_texts = [pdf_content.getPage(i).extractText() for i in range(page_count)]
    content = "\n".join(page_texts)
    return " ".join(content.replace(u"\xa0", " ").strip().split())
I think you need to specify the disc name, it's missing in your directory. For example "D:/Users/Rahul/Desktop/Dfiles/106_2015_34-76357.pdf". I tried and I can read without any problem.
Or if you want to find the file path using the os module which you didn't really associate with your directory, you can try the following:
from PyPDF2 import PdfFileReader
import os
def find(name, path):
    """Return the full path of the first file called *name* under *path*.

    The tree is walked with os.walk; the result is None when no directory
    contains a file with that name.
    """
    for current_dir, _subdirs, filenames in os.walk(path):
        if name not in filenames:
            continue
        return os.path.join(current_dir, name)
    return None
# find() gives back a file path, or None when nothing under the folder matches.
pdf_path = find('106_2015_34-76357.pdf', 'D:/Users/Rahul/Desktop/Dfiles/')
if pdf_path is None:
    # Stop with a readable message instead of letting open(None) raise a TypeError.
    raise SystemExit('106_2015_34-76357.pdf not found under D:/Users/Rahul/Desktop/Dfiles/')
# The with block closes the file even if extraction raises.
with open(pdf_path, 'rb') as f:
    reader = PdfFileReader(f)
    contents = reader.getPage(0).extractText().split('\n')
print(contents)
The find function can be found in Nadia Alramli's answer here Find a file in python
To read the files from multiple folders in a directory, the code below can be used.
This Example is for reading pdf files:
import os
from tika import parser

path = "/usr/local/"  # path directory
# os.path.join with a single argument returns it unchanged, so this is just the path.
directory = path

# Prints the Tika-extracted content of every PDF found under `directory`.
for r, d, f in os.walk(directory):  # going through subdirectories
    for file in f:
        # Match on the extension only, ignoring case, so names that merely contain
        # ".pdf" somewhere (e.g. "report.pdf.bak") are not parsed as PDFs.
        if not file.lower().endswith(".pdf"):
            continue
        file_join = os.path.join(r, file)  # getting full path
        file_data = parser.from_file(file_join)  # parsing the PDF file
        text = file_data['content']  # read the content
        print(text)  # print the content
def getTextPDF(pdfFileName,password=''):
    """Extract the text of a PDF and return it as a list of sentences.

    The text of every page is concatenated, all newline characters are removed,
    and the result is split into sentences with nltk's sent_tokenize.

    Args:
        pdfFileName: Path of the PDF file to read.
        password: Password passed to the reader's decrypt(); the default empty
            string means no decryption is attempted.

    Returns:
        The list produced by sent_tokenize.
    """
    # Imported here, as in the original, so the dependencies are only needed when
    # the function is actually called.
    import PyPDF2
    from nltk import sent_tokenize

    # The with block closes the file once all pages have been extracted.
    with open(pdfFileName, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        if password != '':
            read_pdf.decrypt(password)
        page_texts = [
            read_pdf.getPage(i).extractText()
            for i in range(read_pdf.getNumPages())
        ]
    # Line breaks are dropped entirely (not replaced by spaces) before tokenising.
    text = ''.join(page_texts).replace("\n", '')
    return sent_tokenize(text)
The issue was one of two things: (1) The text was not on page one - hence a user error. (2) PyPDF2 failed to extract the text - hence a bug in PyPDF2.
Sadly, the second one still happens for some PDFs.
Hello Rahul Pipalia,
If PyPDF2 is not installed in your Python environment, install it first and then use this module.
Installation Steps for Ubuntu (Install python-pypdf)
First, open terminal
Then type: sudo apt-get install python-pypdf
Solution to your problem
Try this below code,
# Import Library
import PyPDF2

# Zero-based index of the page to read (0 is the first page of the file).
page_number = 0

# Name of the file you want to read, including the ".pdf" extension.
# Opened in binary mode: PyPDF2 warns that a stream not in binary mode may not be
# read correctly.
with open('Your_Pdf_File_Name.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    # How many pages the file has; page_number must be smaller than this.
    number_of_pages = read_pdf.getNumPages()
    page = read_pdf.getPage(page_number)
    page_content = page.extractText()

# Display content of the pdf
print(page_content)
Download the PDF from below link and try this code,
https://www.dropbox.com/s/4qad66r2361hvmu/sample.pdf?dl=1
I hope my answer is helpful.
If you have any questions, please leave a comment.