word count PDF files when walking directory - python

Hello Stackoverflow community!
I'm trying to build a Python program that will walk a directory (and all sub-directories) and keep an accumulated word count total across all .html, .txt, and .pdf files. Reading a .pdf file requires a little something extra (PdfFileReader) to parse the file. When parsing .pdf files I'm getting the following error and the program stops:
AttributeError: 'PdfFileReader' object has no attribute 'startswith'
When not parsing .pdf files the program completes successfully.
CODE
#!/usr/bin/python
import re
import os
import sys
import os.path
import fnmatch
import collections
from PyPDF2 import PdfFileReader

ignore = [<lots of words>]

def extract(file_path, counter):
    words = re.findall('\w+', open(file_path).read().lower())
    counter.update([x for x in words if x not in ignore and len(x) > 2])

def search(path):
    print path
    counter = collections.Counter()
    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith(('.html', '.txt')):
                    print file
                    extract(os.path.join(root, file), counter)
                if file.lower().endswith(('.pdf')):
                    file_path = os.path.abspath(os.path.join(root, file))
                    print file_path
                    with open(file_path, 'rb') as f:
                        reader = PdfFileReader(f)
                        extract(os.path.join(root, reader), counter)
                        contents = reader.getPage(0).extractText().split('\n')
                        extract(os.path.join(root, contents), counter)
                        pass
    else:
        extract(path, counter)
    print(counter.most_common(50))

search(sys.argv[1])
The full error:
Traceback (most recent call last):
  File line 50, in <module>
    search(sys.argv[1])
  File line 36, in search
    extract(os.path.join(root, reader), counter)
  File line 68, in join
    if b.startswith('/'):
AttributeError: 'PdfFileReader' object has no attribute 'startswith'
It appears there is a failure when calling the extract function with the .pdf file. Any help/guidance would be greatly appreciated!
Expected Results (works w/out .pdf files)
[('cyber', 5101), ('2016', 5095), ('date', 4912), ('threat', 4343)]

The problem is that this line
reader = PdfFileReader(f)
returns an object of type PdfFileReader. You're then passing this object to the extract() function, which expects a file path, not a PdfFileReader object.
My suggestion would be to move the PDF-related processing that you currently have in the search() function into the extract() function instead. There, check whether the file is a PDF and act accordingly. So, something like this:
def extract(file_path, counter):
    if file_path.lower().endswith('.pdf'):
        with open(file_path, 'rb') as f:
            reader = PdfFileReader(f)
            contents = reader.getPage(0).extractText().split('\n')
            counter.update([x for x in contents if x not in ignore and len(x) > 2])
    elif file_path.lower().endswith(('.html', '.txt')):
        words = re.findall(r'\w+', open(file_path).read().lower())
        counter.update([x for x in words if x not in ignore and len(x) > 2])
    else:
        pass  ## some other file type...
I haven't tested the code snippet above, but hopefully you get the idea.
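Note that getPage(0) only reads the first page. As a minimal, untested sketch (assuming the same legacy PyPDF2 PdfFileReader API used above), the PDF branch could instead walk every page and feed each page's text through the same word pattern:
def extract_pdf(file_path, counter):
    # Sketch: iterate all pages instead of only page 0,
    # using PyPDF2's legacy numPages/getPage()/extractText() API.
    with open(file_path, 'rb') as f:
        reader = PdfFileReader(f)
        for page_num in range(reader.numPages):
            text = reader.getPage(page_num).extractText().lower()
            words = re.findall(r'\w+', text)
            counter.update([x for x in words if x not in ignore and len(x) > 2])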

Related

Unable to return required strings from XML files

I have created this code to have a user point at a directory and have it go through the directory looking for .xml files. Once found, the program is supposed to search each file for strings that are 32 characters in length. This is the only requirement; the content is not important at this time, just that it returns the 32-character strings.
I have tried using the regex module within Python as below. When run, the program iterates over the available files and returns all the file names, but the string_recovery function returns only empty lists. I have confirmed visually that the XML contains 32-character strings.
import os
import re
import tkinter as tk
from tkinter import filedialog

def string_recovery(data):
    short_string = re.compile(r"^[a-zA-Z0-9\-._]{32}$")
    strings = re.findall(short_string, data)
    print(strings)

def xml_search(directory):
    xml_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".xml"):
                xml_files.append(os.path.join(root, file))
    print("The following XML files have been found.")
    print(xml_files)
    for xml_file in xml_files:
        with open(xml_file, "r") as f:
            string_recovery(f.read())

def key_finder():
    directory = filedialog.askdirectory()
    xml_search(directory)

key_finder()
By default, Python patterns are not "multiline", so ^ and $ match the start and end of your whole text block, not of each line. You need to set the flag re.M, aka re.MULTILINE:
compare:
import re
text = """
foo
12345678901234567890123456789011
12345678901234567890123456789011
"""
pattern = r"^[a-zA-Z0-9\-._]{32}$"
print(re.findall(pattern, text, re.M)) ## <--- flag
Giving:
['12345678901234567890123456789011', '12345678901234567890123456789011']
with:
import re
text = """
foo
12345678901234567890123456789011
12345678901234567890123456789011
"""
pattern = r"^[a-zA-Z0-9\-._]{32}$"
print(re.findall(pattern, text))
Giving:
[]
Maybe you should go over each line. At the moment you pass the whole file to string_recovery at once:
for xml_file in xml_files:
    with open(xml_file, "r") as f:
        string_recovery(f.read())
If your string_recovery works properly on a single line (I cannot reproduce your example, but create a variable line = ... holding a line that should be recovered and try it), then go over each line instead of the whole file:
for xml_file in xml_files:
    with open(xml_file, "r") as f:
        for line in f.readlines():
            string_recovery(line)
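Combining both suggestions into one minimal, untested sketch (the question's own character class and length, with the multiline flag added so the whole file can still be scanned in a single call):
def string_recovery(data):
    # With re.M, ^ and $ anchor at each line boundary,
    # so one findall over the full file content is enough.
    short_string = re.compile(r"^[a-zA-Z0-9\-._]{32}$", re.M)
    strings = short_string.findall(data)
    print(strings)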

Error while pdf parsing PyPDF2 and textract

I'm trying to build a program that looks for specific words or short phrases in a pdf file. The files load fine, but I have a problem when searching through the pdf once the page changes. Here's my code:
import PyPDF2
import glob, os, shutil
import textract

os.chdir(r"C:\Users\Dani\Desktop\patent")
goodfiles = []
for file in glob.glob("*.pdf"):
    pdfFileObj = open(file, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
    search_word_main = "isophthalic"
    word_main = []
    search_word_sub = ["acid index", "acid value", "acid number", "acidity index", "acidity"]
    word_sub = []
    for pageNum in range(1, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageNum)
        text = pageObj.extractText().encode('utf-8')
        search_text = text.lower().split()
        toprange = len(search_text) - 1
        for len in range(toprange):
            if search_word_main in search_text[len].decode("utf-8"):
                print(search_word_main)
                for key in search_word_sub:
                    if key in search_text[len].decode("utf-8") + " " + search_text[len+1].decode("utf-8"):
                        print(key)
On the first page of the pdf everything works well, but whenever it moves to the second page, I get this error:
Traceback (most recent call last):
  File "test.py", line 19, in <module>
    toprange=len(search_text)-1
TypeError: 'int' object is not callable
I don't understand why this happens whenever the page changes. If I just print the toprange variable rather than using it in the loop, there's no problem and I get the values for toprange. There seems to be a problem with the for loop, but I can't find where. Could you help me solve this?
Thanks in advance.
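For reference, the traceback comes from for len in range(toprange): reusing the builtin name len as the loop variable; once the first page has run, len is an int, so the next page's len(search_text) call raises TypeError. A minimal sketch of the inner loop with the index renamed (untested, otherwise unchanged from the question's code):
# Rename the loop variable so the builtin len() is not shadowed.
for idx in range(toprange):
    if search_word_main in search_text[idx].decode("utf-8"):
        print(search_word_main)
        for key in search_word_sub:
            if key in search_text[idx].decode("utf-8") + " " + search_text[idx + 1].decode("utf-8"):
                print(key)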

Looping through Base64 txt files for bulk conversion to images?

I have a large number of txt files that contain the base64 encoding for image files. Each txt file has a single encoding line starting with "data:image/jpeg;base64,/9j/.........". I got the following to work as far as saving the image:
import base64
import os
import fnmatch

os.chdir(r'D:\Users\dubs\slidesets')
with open('data.image.jpeg.0bac61939da0c.txt', 'r') as file:
    str = file.read().replace('data:image/jpeg;base64,', '')
print str
picname = open("data.image.jpeg.0bac61939da0c.jpg", "wb")
picname.write(str.decode('base64'))
picname.close()
My end goal would be to look in a directory for any txt file with "jpeg" in the name, get and edit the string from it, change to image, and save the image in the same directory with the same filename ('data.image.jpeg.0bff54917a8c7.txt' to 'data.image.jpeg.0bff54917a8c7.jpg').
import fnmatch
import os
import base64

os.chdir(r'D:\Users\dubs\slidesets')
for file in os.listdir(r'D:\Users\dubs\slidesets'):
    if fnmatch.fnmatch(file, "*jpeg*.txt"):
        newname = os.path.basename(file).replace(".txt", ".jpg")
        with open(file, 'r') as file:
            str = file.read().replace('data:image/jpeg;base64,', '')
        picname = open("newname", "wb")
        picname.write(str.decode('base64'))
        picname.close()
The error that I am getting:
Traceback (most recent call last):
File "<stdin>", line 7, in <module>
AttributeError: 'str' object has no attribute 'decode'
I tried "newname", 'newname', and newname because I was unsure how that works with a variable, but that didn't help. I'm also not sure why the decode works for one file in my top code but not in the loop.
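For what it's worth, 'str' object has no attribute 'decode' is what Python 3 raises here: the Python 2 str.decode('base64') idiom no longer exists, and the quoted "newname" literal would also write every image to a single file literally named newname. A minimal, untested Python 3 sketch of the loop (same directory and naming scheme as the question):
import base64
import fnmatch
import os

os.chdir(r'D:\Users\dubs\slidesets')
for filename in os.listdir('.'):
    if fnmatch.fnmatch(filename, "*jpeg*.txt"):
        newname = filename.replace(".txt", ".jpg")
        with open(filename, 'r') as f:
            data = f.read().replace('data:image/jpeg;base64,', '')
        # base64.b64decode replaces the removed str.decode('base64'),
        # and newname (no quotes) is used as the output path.
        with open(newname, 'wb') as pic:
            pic.write(base64.b64decode(data))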

How to read Arabic text from PDF using Python script

I have code written in Python that reads PDF files and converts them to text files.
The problem occurs when I try to read Arabic text from PDF files. I know the error is in the encoding/decoding process, but I don't know how to fix it.
The system converts the Arabic PDF files, but the resulting text file is empty, and it displays this error:
Traceback (most recent call last):
  File "C:\Users\test\Downloads\pdf-txt\text maker.py", line 68, in <module>
    f.write(content)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xa9' in position 50: ordinal not in range(128)
Code:
import os
from os import chdir, getcwd, listdir, path
import codecs
import pyPdf
from time import strftime

def check_path(prompt):
    ''' (str) -> str
    Verifies if the provided absolute path does exist.
    '''
    abs_path = raw_input(prompt)
    while path.exists(abs_path) != True:
        print "\nThe specified path does not exist.\n"
        abs_path = raw_input(prompt)
    return abs_path

print "\n"
folder = check_path("Provide absolute path for the folder: ")

list = []
directory = folder
for root, dirs, files in os.walk(directory):
    for filename in files:
        if filename.endswith('.pdf'):
            t = os.path.join(directory, filename)
            list.append(t)

m = len(list)
print(m)

i = 0
while i <= m - 1:
    path = list[i]
    print(path)
    head, tail = os.path.split(path)
    var = "\\"
    tail = tail.replace(".pdf", ".txt")
    name = head + var + tail
    content = ""
    # Load PDF into pyPDF
    pdf = pyPdf.PdfFileReader(file(path, "rb"))
    # Iterate pages
    for j in range(0, pdf.getNumPages()):
        # Extract text from page and add to content
        content += pdf.getPage(j).extractText() + "\n"
    print strftime("%H:%M:%S"), " pdf -> txt "
    f = open(name, 'w')
    content.encode('utf-8')
    f.write(content)
    f.close
    i = i + 1
You have a couple of problems:
content.encode('utf-8') doesn't do anything. The return value is the encoded content, but you have to assign it to a variable. Better yet, open the file with an encoding, and write Unicode strings to that file. content appears to be Unicode data.
Example (works for both Python 2 and 3):
import io
f = io.open(name,'w',encoding='utf8')
f.write(content)
If you don't close the file properly, you may see no content because the file is not flushed to disk. You have f.close not f.close(). It's better to use with, which ensures the file is closed when the block exits.
Example:
import io
with io.open(name,'w',encoding='utf8') as f:
    f.write(content)
In Python 3, you don't need to import and use io.open but it still works. open is equivalent. Python 2 needs the io.open form.
You can use another library called pdfplumber instead of pypdf or PyPDF2:
import pdfplumber
import arabic_reshaper
from bidi.algorithm import get_display

with pdfplumber.open(r'example.pdf') as pdf:
    my_page = pdf.pages[10]
    thepages = my_page.extract_text()

reshaped_text = arabic_reshaper.reshape(thepages)
bidi_text = get_display(reshaped_text)
print(bidi_text)
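As a follow-up, untested sketch (same pdfplumber, arabic_reshaper, and python-bidi packages; out.txt is a hypothetical output path), the same idea extended over every page and written to a UTF-8 file:
import io
import pdfplumber
import arabic_reshaper
from bidi.algorithm import get_display

with pdfplumber.open(r'example.pdf') as pdf, io.open('out.txt', 'w', encoding='utf8') as f:
    for page in pdf.pages:
        text = page.extract_text() or ''  # extract_text() can return None
        # Reshape the Arabic glyphs, then reorder them for correct display.
        f.write(get_display(arabic_reshaper.reshape(text)) + '\n')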

How do I write a python script that can read doc/docx files and convert them to txt?

Basically I have a folder with plenty of .doc/.docx files. I need them in .txt format. The script should iterate over all the files in a directory, convert them to .txt files and store them in another folder.
How can I do it?
Does there exist a module that can do this?
I figured this would make an interesting quick programming project. This has only been tested on a simple .docx file containing "Hello, world!", but the train of logic should give you a place to work from to parse more complex documents.
from shutil import copyfile, rmtree
import sys
import os
import zipfile
from lxml import etree

# command format: python3 docx_to_txt.py Hello.docx
# let's get the file name
zip_dir = sys.argv[1]
# cut off the .docx, make it a .zip
zip_dir_zip_ext = os.path.splitext(zip_dir)[0] + '.zip'
# make a copy of the .docx and put it in .zip
copyfile(zip_dir, zip_dir_zip_ext)
# unzip the .zip
zip_ref = zipfile.ZipFile(zip_dir_zip_ext, 'r')
zip_ref.extractall('./temp')
# get the xml out of /word/document.xml
data = etree.parse('./temp/word/document.xml')
# we'll want to go over all 't' elements in the xml node tree.
# note that MS Office uses namespaces and that the w must be defined in the namespaces dictionary args
# each w:t element is the "text" of the file. that's what we're looking for
# result is a list filled with the text of each t node in the xml document model
result = [node.text.strip() for node in data.xpath("//w:t", namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})]
# dump result into a new .txt file
with open(os.path.splitext(zip_dir)[0] + '.txt', 'w') as txt:
    # join the elements of result together since txt.write can't take lists
    joined_result = '\n'.join(result)
    # write it into the new file
    txt.write(joined_result)
# close the zip_ref file
zip_ref.close()
# get rid of our mess of working directories
rmtree('./temp')
os.remove(zip_dir_zip_ext)
I'm sure there's a more elegant or pythonic way to accomplish this. You'll need to have the file you want to convert in the same directory as the python file. Command format is python3 docx_to_txt.py file_name.docx
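A tidier variant of the same approach (an untested sketch using the same lxml namespace trick) reads word/document.xml straight out of the archive, so the copy, temp directory, and cleanup steps disappear:
import sys
import zipfile
from lxml import etree

W_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

with zipfile.ZipFile(sys.argv[1]) as zf:
    # a .docx is already a zip archive; read the main document part directly
    data = etree.fromstring(zf.read('word/document.xml'))

result = [node.text.strip() for node in data.xpath("//w:t", namespaces=W_NS) if node.text]

with open(sys.argv[1].rsplit('.', 1)[0] + '.txt', 'w') as txt:
    txt.write('\n'.join(result))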
conda install -c conda-forge python-docx
from docx import Document

doc = Document(file)  # file is the path to a .docx
for p in doc.paragraphs:
    print(p.text)
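To get from that snippet to what the question asks for (a folder of .docx files converted to .txt files in another folder), here is a minimal untested sketch, with in_dir and out_dir as hypothetical folder names:
import glob
import os
from docx import Document

in_dir, out_dir = 'docs', 'txts'  # hypothetical folder names
os.makedirs(out_dir, exist_ok=True)
for path in glob.glob(os.path.join(in_dir, '*.docx')):
    doc = Document(path)
    name = os.path.splitext(os.path.basename(path))[0] + '.txt'
    with open(os.path.join(out_dir, name), 'w') as f:
        f.write('\n'.join(p.text for p in doc.paragraphs))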
Thought I would share my approach. It basically boils down to two commands that convert either .doc or .docx to a string; each option requires a certain package:
import docx
import os
import glob
import subprocess
import sys
# .docx (pip3 install python-docx)
doctext = "\n".join(i.text.encode("utf-8").decode("utf-8") for i in docx.Document(infile).paragraphs)
# .doc (apt-get install antiword)
doctext = subprocess.check_output(["antiword", infile]).decode("utf-8")
I then wrap these solutions up in a function, that can either return the result as a python string, or write to a file (with the option of appending or replacing).
import docx
import os
import glob
import subprocess
import sys

def doc2txt(infile, outfile, return_string=False, append=False):
    if os.path.exists(infile):
        if infile.endswith(".docx"):
            try:
                doctext = "\n".join(i.text.encode("utf-8").decode("utf-8") for i in docx.Document(infile).paragraphs)
            except Exception as e:
                print("Exception in converting .docx to str: ", e)
                return None
        elif infile.endswith(".doc"):
            try:
                doctext = subprocess.check_output(["antiword", infile]).decode("utf-8")
            except Exception as e:
                print("Exception in converting .doc to str: ", e)
                return None
        else:
            print("{0} is not .doc or .docx".format(infile))
            return None
        if return_string == True:
            return doctext
        else:
            writemode = "a" if append == True else "w"
            with open(outfile, writemode) as f:
                f.write(doctext)
    else:
        print("{0} does not exist".format(infile))
        return None
I then would call this function via something like:
files = glob.glob("/path/to/filedir/**/*.doc*", recursive=True)
outfile = "/path/to/out.txt"
for file in files:
    doc2txt(file, outfile, return_string=False, append=True)
It's not often I need to perform this operation, but up until now the script has worked for all my needs, if you find this function has a bug let me know in a comment.
