Script that reads PDF metadata and writes to CSV

Script that reads PDF metadata and writes to CSV - python

I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader
BASEDIR = ''
PDFFiles = []
def extractor():
output = open('windoutput.txt', 'r+')
for file in PDFFiles:
try:
pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
pdf_info = pdf_toread.getDocumentInfo()
#print str(pdf_info) #print full metadata if you want
x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
print x
output.write(x + '\n')
except:
x = file + '~' + ' ERROR: Data missing or corrupt'
print x
output.write(x + '\n')
pass
output.close()
if __name__ == "__main__":
extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os
def extractor():
basedir = os.getcwd()
extension = '.pdf'
pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
with open('pdfmetadata.csv', 'wb') as csvfile:
for f in pdffiles:
try:
pdf_to_read = PdfFileReader(open(f, 'r'))
pdf_info = pdf_to_read.getDocumentInfo()
title = pdf_info['/Title']
subject = pdf_info['/Subject']
csvfile.writerow([file, title, subject])
print 'Metadata for %s written successfully.' % (f)
except:
print 'ERROR reading file %s.' % (f)
#output.writerow(x + '\n')
pass
if __name__ == "__main__":
extractor()
In its current state it seems to just prints a single error (as in, the error message in the exception, not an error returned by Python) message and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?

writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info

Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
for f in files:
#do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob
basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir,extension)))

Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.

Related

Not able to download files from FTP

I am trying to download files using python script from my ftp server...However i am getting the files which are of size 0 kb...i can't understand exactly where i am wrong...I am actually searching the files by a particular string in filename and then downloading all the files having that string on my ftp in a given directory.
Here is my code:
# Libraries
import re
import os
import ftplib
import ntpath
ftp = ftplib.FTP("192.168.1.786:22")
ftp.login("Marshmellow", "YourPasswordHere")
##ftp.dir("feed_1")
files = []
## F = open('Files.txt','a')
try:
files = ftp.nlst("feed_1")
for fname in files:
res = re.findall("2018-07-25", fname)
if res:
# Open the file for writing in binary mode
print 'Opening local file ' + ntpath.basename(fname)
file = open(ntpath.basename(fname), 'wb')
# Download the file a chunk at a time
# Each chunk is sent to handleDownload
# We append the chunk to the file and then print a '.' for progress
# RETR is an FTP command
print 'Getting ' + ntpath.basename(fname)
try:
ftp.retrbinary('RETR ' + ntpath.basename(fname), file.write)
except:
pass
# Clean up time
print 'Closing file ' + ntpath.basename(fname)
file.close()
print (fname)
## F.write(fname + '\n')
if not res:
continue
except ftplib.error_perm , resp:
if str(resp) == "550 No files found":
print "No files in this directory"
pass
else:
raise
## F.close()
Help Me Out if anyone knows what's wrong in this.

try:
ftp.cwd("feed_1")
files = ftp.nlst() for fname in files:
res = re.findall("2018-07-25", fname) if res:
# Open the file for writing in binary mode
print 'Opening local file ' + ntpath.basename(fname)
file = open(ntpath.basename(fname), 'wb')
i've just set the current working directory using ftp.cwd("feed_1") which i did the wrong way earlier like: files = ftp.nlst("feed_1")

Get all files from my C drive - Python

Here is what I try to do:
I would like to get a list of all files that are heavier than 35 MB in my C drive.
Here is my code:
def getAllFileFromDirectory(directory, temp):
files = os.listdir(directory)
for file in files:
if (os.path.isdir(file)):
getAllFileFromDirectory(file, temp)
elif (os.path.isfile(file) and os.path.getsize(file) > 35000000):
temp.write(os.path.abspath(file))
def getFilesOutOfTheLimit():
basePath = "C:/"
tempFile = open('temp.txt', 'w')
getAllFileFromDirectory(basePath, tempFile)
tempFile.close()
print("Get all files ... Done !")
For some reason, the interpreter doesn't go in the if-block inside 'getAllFileFromDirectory'.
Can someone tell me what I'm doing wrong and why (learning is my aim). How to fix it ?
Thanks a lot for your comments.

I fixed your code. Your problem was that os.path.isdir can only know if something is a directory if it receives the full path of it. So, I changed the code to the following and it works. Same thing for os.path.getsize and os.path.isfile.
import os
def getAllFileFromDirectory(directory, temp):
files = os.listdir(directory)
for file in files:
if (os.path.isdir(directory + file)):
if file[0] == '.': continue # i added this because i'm on a UNIX system
print(directory + file)
getAllFileFromDirectory(directory + file, temp)
elif (os.path.isfile(directory + file) and os.path.getsize(directory + file) > 35000000):
temp.write(os.path.abspath(file))
def getFilesOutOfTheLimit():
basePath = "/"
tempFile = open('temp.txt', 'w')
getAllFileFromDirectory(basePath, tempFile)
tempFile.close()
print("Get all files ... Done !")
getFilesOutOfTheLimit()

How do I fix this file_tracker that reads/writes using JSON dictionaries?

I am trying to write a script that tracks for changes made in directories/files set to multiple file paths created by an installer. I found Thomas Sileo's DirTools project on git, modified it, but am now running into some issues when writing/reading from JSON:
1) First, I believe that I am writing to JSON incorrectly and am finding that my create_state() function is only writing the last path I need.
2) If I get it working, I am unable to read/parse the file like I was before. I usually get ValueError: Extra data errors
Code below:
import os import json import getpass
files = [] subdirs = []
USER = getpass.getuser()
pathMac = ['/Applications/',
'/Users/' + USER + '/Documents/' ]
def create_dir_index(path):
files = []
subdirs = []
for root, dirs, filenames in os.walk(path):
for subdir in dirs:
subdirs.append(os.path.relpath(os.path.join(root, subdir), path))
for f in filenames:
files.append(os.path.relpath(os.path.join(root, f), path))
return dict(files=files, subdirs=subdirs)
def create_state(): for count in xrange(len(pathMac)):
dir_state = create_dir_index(pathMac[count])
out_file = open("Manifest.json", "w")
json.dump(dir_state, out_file)
out_file.close()
def compare_states(dir_base, dir_cmp):
'''
return a comparison two manifest json files
'''
data = {}
data['deleted'] = list(set(dir_cmp['files']) - set(dir_base['files']))
data['created'] = list(set(dir_base['files']) - set(dir_cmp['files']))
data['deleted_dirs'] = list(set(dir_cmp['subdirs']) - set(dir_base['subdirs']))
data['created_dirs'] = list(set(dir_base['subdirs']) - set(dir_cmp['subdirs']))
return data
if __name__ == '__main__':
response = raw_input("Would you like to Compare or Create? ")
if response == "Create":
# CREATE MANIFEST json file
create_state()
print "Manifest file created."
elif response == "Compare":
# create the CURRENT state of all indexes in pathMac and write to json file
for count in xrange(len(pathMac)):
dir_state = create_dir_index(pathMac[count])
out_file = open("CurrentState.json", "w")
json.dump(dir_state, out_file)
out_file.close()
# Open and Load the contents from the file into dictionaries
manifest = json.load(open("Manifest.json", "r"))
current = json.load(open("CurrentState.json", "r"))
print compare_states(current, manifest)

Python - I'm trying to unzip a file that has multiple zip files within

My goal is to get to a txt file that is withing the second layer of zip files. The issue is that the txt file has the same name in all the .zip, so it overwrites the .txt and it only returns 1 .txt
from ftplib import *
import os, shutil, glob, zipfile, xlsxwriter
ftps = FTP_TLS()
ftps.connect(host='8.8.8.8', port=23)
ftps.login(user='xxxxxxx', passwd='xxxxxxx')
print ftps.getwelcome()
print 'Access was granted'
ftps.prot_p()
ftps.cwd('DirectoryINeed')
data = ftps.nlst() #Returns a list of .zip diles
data.sort() #Sorts the thing out
theFile = data[-2] #Its a .zip file #Stores the .zip i need to retrieve
fileSize = ftps.size(theFile) #gets the size of the file
print fileSize, 'bytes' #prints the size
def grabFile():
filename = 'the.zip'
localfile = open(filename, 'wb')
ftps.retrbinary('RETR ' + theFile, localfile.write)
ftps.quit()
localfile.close()
def unzipping():
zip_files = glob.glob('*.zip')
for zip_file in zip_files:
with zipfile.ZipFile(zip_file, 'r')as Z:
Z.extractall('anotherdirectory')
grabFile()
unzipping()
lastUnzip()
After this runs it grabs the .zip that I need and extracts the contents to a folder named anotherdirectory. Where it holds the second tier of .zips. This is where I get into trouble. When I try to extract the files from each zip. They all share the same name. I end up with a single .txt when I need one for each zip.

I think you're specifying the same output directory and filename each time. In the unzipping function,
change
Z.extractall('anotherdirectory')
to
Z.extractall(zip_file)
or
Z.extractall('anotherdirectory' + zip_file)
if the zip_file's are all the same, give each output folder a unique numbered name:
before unzipping function:
count = 1
then replace the other code with this:
Z.extractall('anotherdirectory/' + str(count))
count += 1

Thanks to jeremydeanlakey's response, I was able to get this part of my script. Here is how I did it:
folderUnzip = 'DirectoryYouNeed'
zip_files = glob.glob('*.zip')
count = 1
for zip_file in zip_files:
with zipfile.ZipFile(zip_file, 'r') as Z:
Z.extractall(folderUnzip + '/' + str(count))
count += 1

File naming problem with Python

I am trying to iterate through a number .rtf files and for each file: read the file, perform some operations, and then write new files into a sub-directory as plain text files with the same name as the original file, but with .txt extensions. The problem I am having is with the file naming.
If a file is named foo.rtf, I want the new file in the subdirectory to be foo.txt. here is my code:
import glob
import os
import numpy as np
dir_path = '/Users/me/Desktop/test/'
file_suffix = '*.rtf'
output_dir = os.mkdir('sub_dir')
for item in glob.iglob(dir_path + file_suffix):
with open(item, "r") as infile:
reader = infile.readlines()
matrix = []
for row in reader:
row = str(row)
row = row.split()
row = [int(value) for value in row]
matrix.append(row)
np_matrix = np.array(matrix)
inv_matrix = np.transpose(np_matrix)
new_file_name = item.replace('*.rtf', '*.txt') # i think this line is the problem?
os.chdir(output_dir)
with open(new_file_name, mode="w") as outfile:
outfile.write(inv_matrix)
When I run this code, I get a Type Error:
TypeError: coercing to Unicode: need string or buffer, NoneType found
How can I fix my code to write new files into a subdirectory and change the file extensions from .rtf to .txt? Thanks for the help.

Instead of item.replace, check out some of the functions in the os.path module (http://docs.python.org/library/os.path.html). They're made for splitting up and recombining parts of filenames. For instance, os.path.splitext will split a filename into a file path and a file extension.
Let's say you have a file /tmp/foo.rtf and you want to move it to /tmp/foo.txt:
old_file = '/tmp/foo.rtf'
(file,ext) = os.path.splitext(old_file)
print 'File=%s Extension=%s' % (file,ext)
new_file = '%s%s' % (file,'.txt')
print 'New file = %s' % (new_file)
Or if you want the one line version:
old_file = '/tmp/foo.rtf'
new_file = '%s%s' % (os.path.splitext(old_file)[0],'.txt')

I've never used glob, but here's an alternative way without using a module:
You can easily strip the suffix using
name = name[:name.rfind('.')]
and then add the new suffix:
name = name + '.txt'
Why not using a function ?
def change_suffix(string, new_suffix):
i = string.rfind('.')
if i < 0:
raise ValueError, 'string does not have a suffix'
if not new_suffix[0] == '.':
new_suffix += '.'
return string[:i] + new_suffix

glob.iglob() yields pathnames, without the character '*'.
therefore your line should be:
new_file_name = item.replace('.rtf', '.txt')
consider working with clearer names (reserve 'filename' for a file name and use 'path' for a complete path to a file; use 'path_original' instead of 'item'), os.extsep ('.' in Windows) and os.path.splitext():
path_txt = os.extsep.join([os.path.splitext(path_original)[0], 'txt'])
now the best hint of all:
numpy can probably read your file directly:
data = np.genfromtxt(filename, unpack=True)
(see also here)
To better understand where your TypeError comes from, wrap your code in the following try/except block:
try:
(your code)
except:
import traceback
traceback.print_exc()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Script that reads PDF metadata and writes to CSV - python

writerow([file, title, subject]) should be writerow([f, title, subject]) You can use sys.exc_info() to print the details of your error http://docs.python.org/2/library/sys.html#sys.exc_info

Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.

Related

Not able to download files from FTP

Get all files from my C drive - Python

How do I fix this file_tracker that reads/writes using JSON dictionaries?

Python - I'm trying to unzip a file that has multiple zip files within

File naming problem with Python

Categories

Resources