This may just be due to PyPdf2's extract text function but when I run the code below in order to rename the files, a lot of the most common words come out like "Nthe", "Nfrom" and "Ncommunications". I'm not sure what I can do to stop this happening or alternatively how to work around it.
What causes a problem like this?
Where are the N's coming from?
Other PDFs do perfectly what I want so I'm not sure where to go from here.
import PyPDF2
import re
from collections import Counter
import os.path
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
for file in files:
pdfFileObj = open('{0}'.format(file), 'rb') #Open the File
pdfReader = PyPDF2.PdfFileReader(pdfFileObj) #Read the file
frequency = {} #Create dict
ignore = {'the','a','if','in','it','of','or','and','for','can','that','are','this','your','you','will','not','have','its','with','need','has','from','more'} #Ignore dese ones
print "Number of Pages %s " % pdfReader.numPages #Print Num Pages
word_list = []
for i in range(pdfReader.numPages):
pageObj = pdfReader.getPage(i) # Get the first page
word_list.append(pageObj.extractText()) #Add the pages together
match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(word_list)) #Find the text
cnt = Counter()
for words in match_pattern: #Start counting the frequency
words.lower() # Lower Case Words
if words not in ignore: #Ignore common words
count = frequency.get(words,0) #Start the count?
frequency[words] = count + 1 #Add one
fl = sorted(frequency, key=frequency.__getitem__, reverse = True)[:3] #Sort according to frequency
pdfFileObj.close() #Close the PDF
newtitle = ' '.join(map(str,fl, )).title() #Join the title list together
try:
print newtitle #Print the title
os.rename('{0}'.format(file), '{0}.pdf'.format(newtitle))#Rename the file
except:
print "Unable to Rename File"
Related
thank you so much for taking your time. Please see code below. The code works, but instead of searching for one word, I need to search for several words. I've tried:
search_word = ['python' , 'aws' , 'sql']
but this doesn't work. Any ideas on how to make this work?
Any suggestions to improve the code are all welcome!
Code:
directory = r"/Users/resumes_for_testing/"
# define keywords
search_word = 'python'
# Loop through all PDFs in specified directory:
for filename in os.listdir(directory):
if filename.endswith(".pdf"):
# open the pdf file
f = open(filename,'rb')
object = PyPDF2.PdfFileReader(f)
# search for keywords
for i in range(object.numPages):
page = object.getPage(i)
text = page.extractText()
search_text = text.lower().split()
for word in search_text:
if search_word in word:
print("The word '{}' was found in '{}'".format(search_word,filename))
You could try small change in approach where instead of looping the search_text you could loop through your list of search_words and then use if statement to see whether it is in search_text
e.g.
# define keywords
search_words = ['python', 'aws', 'sql']
# Loop through all PDFs in specified directory:
for filename in os.listdir(directory):
if filename.endswith(".pdf"):
# open the pdf file
f = open(filename,'rb')
object = PyPDF2.PdfFileReader(f)
# search for keywords
for i in range(object.numPages):
page = object.getPage(i)
text = page.extractText()
search_text = text.lower().split()
for word in search_words:
if word in search_text:
print("The word '{}' was found in '{}'".format(word, filename))
Try pdfreader to extract texts:
import os
from pdfreader import SimplePDFViewer, PageDoesNotExist
def search_in_file(fname, search_words):
fd = open(fname, "rb")
viewer = SimplePDFViewer(fd)
try:
while True:
viewer.render()
text = "".join(viewer.canvas.strings)
for word in search_words:
if word in text:
print("The word '{}' was found in '{}' on page {}".format(word, fname, viewer.current_page_number))
viewer.next()
except PageDoesNotExist:
pass
# define keywords
search_words = ['python', 'aws', 'sql']
# define directory
directory = "./"
# Loop through all PDFs in specified directory:
for fname in os.listdir(directory):
if fname.endswith(".pdf"):
search_in_file(fname, search_words)
This code works but I have to call all the files one by one, I need to call only the folder where the files are and to save the results in another folder.
I am not figuring out :( Can anybody help me, I'm new in Python. Thank you I appreciate :)
import re
import string
import sys
frequency = {}
sys.stdin = open('C:/Users/Desktop/app/data/sources/books/test.txt', 'r')
sys.stdout =open('C:/Users/Desktop/app/data/fre/news/test.txt', 'w')
text_string = sys.stdin.read()
match_pattern = re.findall(r'([-][\w]+)', text_string)
for word in match_pattern:
count = frequency.get(word,0)
frequency[word] = count + 1
frequency_list = frequency.keys()
for word in frequency_list:
print (word, frequency[word])
Maybe something like this?
import glob
import os
books = glob.glob("C:/Users/Desktop/app/data/sources/books/*.txt")
# now you have a list of all .txt files in that directory.
def writer(text_string, output_file):
"""A function to write out items from an input text string"""
frequency = {}
match_pattern = re.findall(r'([-][\w]+)', text_string)
for word in match_pattern:
count = frequency.get(word,0)
frequency[word] = count + 1
frequency_list = frequency.keys()
for word in frequency_list:
print(word, frequency[word], file=open(output_file, "a"))
# now you have a function that essentially does the procedure you already know works
for book in books:
book_name = os.path.split(book)[-1] # get <filename>.txt from the path
# context manager will close the stream when you're done
with open(book, "r") as file:
text_string = file.read()
output_file = "C:/Users/Desktop/app/data/fre/news/" + book_name
writer(text_string, output_file)
This code will iterate through the .txt files in the directory you were reading from.
I encapsulated your working code in a function (somewhat reformatted for clarity, you can specify where to print to directly from the print function), so as you iterate through the files you can read them in and drop them through the working code.
I am trying to merge all the values into one list when I run my for loop. However I keep getting to separate brackets in one list.
For example, when I run this code:
import glob
import re
#import PyPDF2
folder_path='/Users/my_path/cb_files'
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern, recursive=True)
#IP Bank
import re
ip = re.compile(r"((?:^|\b)(?:h[tTxX]ps?://)?(?:\d{1,3}\[?\.\]?){3}\d{1,3}(?:\b|$))")
hash_ = re.compile(r"((?:^|\b)[a-fA-F0-9]{32,64}(?:\b|$))")
domain = re.compile(r"((?:^|\b)(?:h[xXtT]{2}ps?:|meows?:)?//(?:[a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDFC\uFDF0-\uFFEF_.\[\]-]+)(?:\[?\.\]?[a-z]+)+(?::\d+)?(?:[/?][^\s]*)?)")
ip_list=[]
for file in folder_contents:
if re.search(r".*(?=pdf$)",file):
#this is pdf
pdfFileObj = open('pdf.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)
read_file=pageObj.extractText()
elif '.' not in file:
continue
else:
read_file = open(file, 'rt', encoding="latin-1").read()
if ip.findall(read_file) or hash_.findall(read_file) or domain.findall(read_file):
ips =ip.findall(read_file)
hashs= hash_.findall(read_file)
domains=domain.findall(read_file)
# print("IPS",', '.join(ips))
ip_list.append(ips)
print(ip_list)
Here is my output:
[['000.000.0.1', '111.111.1.1'], ['222.222.2.2','333.333.3.3']]
So it looks like for each file it loops over, it is putting it in its own list.
I want the output to look like this:
['000.000.0.1', '111.111.1.1','222.222.2.2','333.333.3.3']
Any changes in my code that will produce these results?
Try changing:-
ip_list.append(ips)
to
ip_list.extend(ips)
The following Python program reads a PDF file and collects unique words used in that file.
import PyPDF2
import re
print('process started')
pdfFile = open('pdf_file.pdf', 'rb')
pdfFileReader = PyPDF2.PdfFileReader(pdfFile)
pdfFilePageCount = pdfFileReader.numPages
pdfPageText = ""
for i in range(pdfFilePageCount):
pageObj = pdfFileReader.getPage(i)
pdfPageText += pageObj.extractText()
pdfPageText = pdfPageText.lower()
pdfPageText = re.sub(r'[^a-zA-Z]+', ' ', pdfPageText)
listOfWords = pdfPageText.split()
setPage = set(listOfWords)
textFile = open('text_file.txt', 'w')
for item in listOfWords:
textFile.write("%s\n" % item)
textFile.close()
pdfFile.close()
print('process ended')
Is it possible to add page numbers with each word (i.e. to indicate from which page they were picked)?
For instance, if a word "xyzabc" is found in multiple pages, I need to list them all as follows:
xyzabc (1,22,130, ...)
You can create a dict whose keys are the words, and values a list of page numbers.
Using a defaultdict makes it easy to append the page numbers.
You also have to update the dict in each loop, as shown in the modified code here:
import PyPDF2
import re
from collections import defaultdict
print('process started')
pdfFile = open('pdf_file.pdf', 'rb')
pdfFileReader = PyPDF2.PdfFileReader(pdfFile)
pdfFilePageCount = pdfFileReader.numPages
wordsAndPages = defaultdict(list)
pdfPageText = ""
for page in range(pdfFilePageCount):
pageObj = pdfFileReader.getPage(page)
pdfPageText = pageObj.extractText()
print(pdfPageText)
pdfPageText = pdfPageText.lower()
pdfPageText = re.sub(r'[^a-zA-Z]+', ' ', pdfPageText)
listOfWords = pdfPageText.split()
for word in listOfWords:
wordsAndPages[word].append(page)
textFile = open('text_file.txt', 'w')
for word, pages in wordsAndPages.items():
output = '{} ({})\n'.format(word, ','.join([str(page) for page in pages]))
textFile.write(output)
textFile.close()
pdfFile.close()
print('process ended')
and you can finally output the data in the expected format.
# coding=utf-8
# Libreria RegEx de Python.
import re
# Libreria para rutas.
import os
import csv
# function betwwen: return the value between two words a and b
def between(value, a, b):
pos_a = value.find(a) # Find and validate before-part.
if pos_a == -1: return "" # Find and validate after part.
pos_b = value.rfind(b)
if pos_b == -1: return "" # Return middle part.
adjusted_pos_a = pos_a + len(a)
if adjusted_pos_a >= pos_b: return ""
return value[adjusted_pos_a:pos_b]
# function scan folder DiarioOficial
def scan_folder():
# directory 'path'
path = '/Users/anna/PycharmProjects/extractData/DiarioOficial'
# contador de ficheros del path
count = 0
# creation csv as csvFile
with open('All_Companies1.csv', 'a') as csvFile:
# iterate all paths in the folder DiarioOficial without name
for (path, dirnames, file_names) in os.walk(path):
# iterate over all the files in the path (+ file_name)
for file_name in file_names:
# Add extension that is required
if file_name.endswith(".txt"):
# summatory count files in DiarioOficial folder
count = count + 1
# concatenation path + file name
file_path=os.path.join(path, file_name)
#print(file_path)
# open and read the file path
mensaje = open(file_path).read()
# Replace a newline for a space
mensaje = mensaje.replace("\n","")
# Company Name
keywords_cap = ['SpA', 'SPA', 'LIMITADA', 'LTDA', 'S.A.', 'E.I.R.L.', 'S.L.']
# re.escape to solve the problem with metacharacters in keyword_obj
keywords_cap = map(re.escape, keywords_cap)
# sorting the items by lengh in descending order
keywords_cap.sort(key=len, reverse=True)
obj = re.compile(r'[:,;.]\s*"?([^:,;.]*?(?<!\w)(?:{}))'.format('|'.join(keywords_cap)))
if obj:
# To obtain the first match obj.search(mensaje).group(1)
company_name = obj.search(mensaje)
else:
company_name = "None"
# CVE Number of the file
regex = r"\s*CVE\s+([^|]*)"
matches = re.search(regex, mensaje)
if matches:
company_cve = matches.group(1).strip()
else:
company_cve = "None"
# Section of diariooficial.interior.gob.cl
company_sect = between(mensaje, 'SECCIÓN', 'Núm.')
if company_sect:
company_sect = company_sect
else:
company_sect = "None"
# Name of the person that constitutes the company
company_ceo = re.search(r'\sante mí,\s+([^,]*)', mensaje)
if company_ceo:
company_ceo = company_ceo.group(1)
else:
company_ceo = "None"
# File Number from Section
num_reg = r'\sNúm.\s+([^|]*)'
match_num = re.search(num_reg, mensaje)
if match_num:
company_numsect = match_num.group(1)
else:
company_numsect = "None"
# Social Capital ($)
cap = r"\s*(CAPITAL:\s+([^-]*)|Capital social:\s+([^-]*)|Capital:\s+([^-]*)|Capital:\s+([^,]*))"
caps = re.search(cap, mensaje)
if caps:
company_capital = caps.group()
else:
company_capital = 'None'
csvData = [company_name, company_cve, company_sect, company_ceo, company_numsect, company_capital]
headers = ['COMPANY NAME', 'CVE', 'SECTION','CEO NAME','NUMBER SECTOR','COMPANY CAPITAL']
writer = csv.writer(csvFile, delimiter=',') # create a csv delimited by comma
writer.writerow(headers) # print the header row
writer.writerow(csvData) # print the Data in csv
# Number of txt files
print (count)
scan_folder()
I have this script that create a csv with the data extracted from a text in specific path. In spite of the errors that can be on RegEx, mainly it extracts parts of text that it keeps them in variables and the printa in a csv. Each company must have a single line in this csv. In this way, when the csv is opened, the number of companies and all the information can be visualized by variables.
My problem is that when I see the CSV called, in this case, All_companies1, the data is not put in the same row, they jump.
Also, the titles are repeated, and I do not want them to repeat themselves
First try changing the mode for the csvFile from a (append) to w (write), also check if the editor you're using actual uses the comma as the column delimiter for csv files, since in the above picture is seems as if the comma is seen by the editor as a normal character.
Also remove any carriage return characters (\n \r) from your string before printing it, this can be done in the following code.
csvData = [str(data).replace('\n', '').replace('\r', '') for data in csvData]
Note:
if by any chance this works, there might be a problem with with having empty rows in the csv file beteen each two elements, this can be fixed by changing with open('All_Companies1.csv', 'a') as csvFile to with open('All_Companies1.csv', 'a', newline='') as csvFile