I have a script that creates a folder called "videos" on a USB drive and moves 6,500 WMV files over to the "videos" folder. Then it's supposed to create an HTML page with hyperlinks to each file. Here is my current example, which is broken. I'm trying to have it crawl the videos directory and create an HTML page with hyperlinks only to the local files on the USB drive.
#!/usr/bin/python
import os.path
import os
import shutil
import re

# Create the videos directory in the current location
# If the directory exists ignore it
def createDirectory():
    directory = "videos"
    if not os.path.isdir("./" + directory + "/"):
        os.mkdir("./" + directory + "/")
        print "Videos Folder Created."
    else:
        print "Video Folder Exists."
        print "---------------------"

# Move all the files in the root directory with the .wmv extension
# to the videos folder
def moveVideos():
    for file in os.listdir("."):
        if os.path.splitext(file)[1] == ".wmv":
            print "Moving:", file
            shutil.move(file, os.path.join("videos", file))

def createHTML():
    videoDirectory = os.listdir("videos")
    f = open("videos.html", "w")
    f.writelines(videoDirectory)
    r = re.compile(r"(\\[^ ]+)")
    print r.sub(r'\1', videoDirectory)

createDirectory()
moveVideos()
createHTML()
import cgi

def is_video_file(filename):
    return filename.endswith(".wmv")  # customize however you like

def createHTML():
    videoDirectory = os.listdir("videos")
    with open("videos.html", "w") as f:
        f.write("<html><body><ul>\n")
        for filename in videoDirectory:
            if is_video_file(filename):
                f.write('<li><a href="videos/%s">%s</a></li>\n' %
                        (cgi.escape(filename, True), cgi.escape(filename)))
        f.write("</ul></body></html>\n")
Don't do f.writelines(videoDirectory) and then regex. Besides, you're only printing to the console with that regex substitution.
Do
videoDirectory = os.listdir("videos")
f = open("videos.html", "w")
f.write('<html><head></head><body><ul>')
f.writelines(['<li><a href="videos/%s">%s</a></li>' % (name, name) for name in videoDirectory])
f.write('</ul></body></html>')
def createHTML():
    h = open("videos.html", 'w')
    for vid in os.listdir("videos"):
        path = "./videos/" + vid
        # link to the file's relative path; the file name itself is the link text
        h.write("<a href='" + path + "'>" + vid + "</a>\n")
    h.close()
    print "done writing HTML file"
I need a final touch from an expert! I want to convert all PDF files in a directory to txt files. I wrote code to create empty txt files having the same names as the pdf files, and code to convert a single pdf to txt, but I want to convert all files in the directory. Please see the code below:
PS: I already tried PDFMiner and every other package, and it does not work.
import pandas as pd
import os
import PyPDF2

### Create empty txt files named as pdf files ###########
path = '....\\PDF2Text\\PDF\\'
newpath = '....\\PDF2Text\\Text\\'

files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.pdf' in file:
            files.append(os.path.join(r, file))

for f in files:
    ext = f.replace('.pdf', '.txt')
    extpath = ext.replace(path, newpath)
    ft = open(extpath, "w+")
    ft.close()
    print(extpath)

## Here we convert a single pdf file to a txt file, providing the pdf path and the empty txt path #####
import PyPDF2

def getPDFFileContentToTXT(pdfFile):
    myPDFFile = PyPDF2.PdfFileReader(pdfFile)
    with open('....\\PDF2Text\\Text\\blabla.txt', 'w') as pdf_output:
        for page in range(myPDFFile.getNumPages()):
            data = myPDFFile.getPage(page).extractText()
            pdf_output.write(data)
    with open('.....\\PDF2Text\\Text\\blabla.txt', 'r') as myPDFContent:
        return myPDFContent.read().replace('\n', ' ')

pdfFileContent = getPDFFileContentToTXT('.....\\PDF2Text\\PDF\\blabla.pdf')
import pandas as pd
import os
import PyPDF2

# Create empty txt files named as pdf files
path = 'C:\\PDF2Text\\PDF\\'
newpath = 'C:\\PDF2Text\\Text\\'

# r=root, d=directories, f = files
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.pdf' in file:
            files.append(os.path.join(r, file))

for f in files:
    txt = f.replace('.pdf', '.txt')
    txtpath = txt.replace(path, newpath)
    print(f)
    ft = open(txtpath, "w+")
    ft.close()
    print(txtpath)
    Vpath = f.replace('.pdf', '')
    #print(Vpath)
    myPDFFile = PyPDF2.PdfFileReader(f)
    with open(txtpath, 'w') as pdf_output:  #, encoding="utf-8"
        for page in range(myPDFFile.getNumPages()):
            data = myPDFFile.getPage(page).extractText()
            pdf_output.write(data)
    with open(txtpath, 'r') as myPDFContent:
        myPDFContent.read().replace('\n', ' ')
Have you tried Tika? Just do a pip install tika (you also need Java 7+ installed on your system), and maybe this is the piece of code you want:
import os
from tika import parser

def read_pdf(pdf_file):
    text = parser.from_file(pdf_file)['content']
    return text.encode('utf-8')

def pdf_to_txt(folder_with_pdf, dest_folder):
    """
    folder_with_pdf: path to your pdf's
    dest_folder: path where you want .txt files saved
    """
    pdf_files = []
    for root, dirs, files in os.walk(folder_with_pdf):
        for f in files:
            if '.pdf' in f:
                pdf_files.append(os.path.join(root, f))
    #print(pdf_files)
    for file_ in pdf_files:
        text_file = os.path.splitext(os.path.basename(file_))[0] + '.txt'
        with open(os.path.join(dest_folder, text_file), 'wb') as text_f:
            text_f.write(read_pdf(file_))
    return None

pdf_to_txt('./pdf_folder', './txt_folder')  # you should see .txt files being populated in ./txt_folder
Aside: if PDF files in sub-directories of ./pdf_folder happen to have the same name (but different content), then you will lose one (or more) of the .txt files.
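If that matters, one hedged workaround (my own addition, not from the answer above) is to fold the relative sub-path into the output file name:

import os

def unique_txt_name(pdf_path, folder_with_pdf):
    # flatten sub/dir/a.pdf into sub_dir_a.txt so same-named files cannot collide
    rel = os.path.relpath(pdf_path, folder_with_pdf)
    return os.path.splitext(rel)[0].replace(os.sep, '_') + '.txt'

text_file = unique_txt_name(file_, folder_with_pdf) would then replace the os.path.basename line inside pdf_to_txt.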
I'm trying to implement a Python script that takes a folder from the user (which can be zipped or unzipped) and searches through all the files in the folder to output the specific lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders that are passed to the function. Below is my code, thanks in advance!
def myFunction(folder_name):
    path = folder_name
    for (path, subdirs, files) in os.walk(path):
        files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')]  # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
        files.sort()  # file is sorted list
        files = [os.path.join(path, name) for name in files]  # Joins the path and the name, so the files can be opened and scanned by the open() function
        # The following for loop searches all files with the selected format
        for filename in files:
            #print('start parsing... ' + str(datetime.datetime.now()))
            matched_line = []
            try:
                with open(filename, 'r', encoding='utf-8') as f:
                    f = f.readlines()
            except:
                with open(filename, 'r') as f:
                    f = f.readlines()
            # print('Finished parsing... ' + str(datetime.datetime.now()))
            for line in f:
                # strip out \x00 from read content, in case it's encoded differently
                line = line.replace('\x00', '')
                RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
                RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
                pattern2 = re.compile('(' + RE2 + '|' + RE3 + ')', re.IGNORECASE)
                for match2 in pattern2.finditer(line):
                    matched_line.append(line)
                    print(line)

# Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try/except block in my code was my attempt to open the zipped folder and read it. I'm still not very clear on how to open the zipped folder or how it works. Please let me know how I can modify my code to make it work. Much appreciated!
One possibility is to first determine what kind of object folder_name is, using zipfile and os.path.isdir(), and whichever test succeeds, get the list of files and proceed. Maybe something like this:
import zipfile, os, re

def myFunction(folder_name):
    files = None  # nothing yet
    path = folder_name
    if zipfile.is_zipfile(path):
        print('ZipFile: {}'.format(path))
        f = zipfile.ZipFile(path)
        files = f.namelist()
        # for name in f.namelist():  # debugging
        #     print('file: {}'.format(name))
    elif os.path.isdir(path):
        print('Folder: {}'.format(path))
        files = os.listdir(path)
        # for name in os.listdir(path):  # debugging
        #     print('file: {}'.format(name))
    # should now have a list of files
    # proceed processing the files
    for filename in files:
        ...
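To actually scan the member contents rather than just list their names, here is a minimal sketch of what that processing step could look like (search_zip_members and its extension filter are my own illustration, not part of the code above):

import io, re, zipfile

def search_zip_members(zip_path, pattern):
    # scan each text-like member of the archive line by line
    with zipfile.ZipFile(zip_path) as zf:
        for name in zf.namelist():
            if not name.endswith(('.txt', '.log', '.out')):
                continue
            with zf.open(name) as member:
                # ZipFile.open() yields a binary stream; wrap it to iterate decoded lines
                for line in io.TextIOWrapper(member, encoding='utf-8', errors='replace'):
                    if pattern.search(line):
                        print('{}: {}'.format(name, line.rstrip()))

search_zip_members('SampleZippedFolder.zip', re.compile(r'version', re.IGNORECASE))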
Here is what I am trying to do:
I would like to get a list of all files that are larger than 35 MB on my C drive.
Here is my code:
def getAllFileFromDirectory(directory, temp):
    files = os.listdir(directory)
    for file in files:
        if (os.path.isdir(file)):
            getAllFileFromDirectory(file, temp)
        elif (os.path.isfile(file) and os.path.getsize(file) > 35000000):
            temp.write(os.path.abspath(file))

def getFilesOutOfTheLimit():
    basePath = "C:/"
    tempFile = open('temp.txt', 'w')
    getAllFileFromDirectory(basePath, tempFile)
    tempFile.close()
    print("Get all files ... Done !")
For some reason, the interpreter doesn't go into the if-block inside getAllFileFromDirectory.
Can someone tell me what I'm doing wrong and why (learning is my aim)? How can I fix it?
Thanks a lot for your comments.
I fixed your code. Your problem was that os.path.isdir can only know whether something is a directory if it receives its full path. So I changed the code to the following and it works. The same goes for os.path.getsize and os.path.isfile.
import os

def getAllFileFromDirectory(directory, temp):
    files = os.listdir(directory)
    for file in files:
        fullpath = os.path.join(directory, file)  # build the full path once
        if os.path.isdir(fullpath):
            if file[0] == '.': continue  # i added this because i'm on a UNIX system
            print(fullpath)
            getAllFileFromDirectory(fullpath, temp)
        elif os.path.isfile(fullpath) and os.path.getsize(fullpath) > 35000000:
            temp.write(os.path.abspath(fullpath) + '\n')  # one path per line

def getFilesOutOfTheLimit():
    basePath = "/"
    tempFile = open('temp.txt', 'w')
    getAllFileFromDirectory(basePath, tempFile)
    tempFile.close()
    print("Get all files ... Done !")

getFilesOutOfTheLimit()
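For comparison, here is a hedged sketch of the same scan built on os.walk, which handles the recursion and path joining for you (find_large_files and its error handling are my additions, not part of the fix above):

import os

def find_large_files(base_path, out_path, limit=35 * 1000 * 1000):
    # os.walk recurses and yields (root, dirs, files) for every directory
    with open(out_path, 'w') as out:
        for root, dirs, files in os.walk(base_path):
            for name in files:
                full = os.path.join(root, name)
                try:
                    if os.path.getsize(full) > limit:
                        out.write(full + '\n')
                except OSError:
                    pass  # skip files we cannot stat (permissions, broken links)

find_large_files("/", "temp.txt")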
I am trying to write a script that tracks changes made in directories/files under multiple file paths created by an installer. I found Thomas Sileo's DirTools project on git and modified it, but am now running into some issues when writing/reading from JSON:
1) First, I believe that I am writing to JSON incorrectly: my create_state() function is only writing the last path I need.
2) If I do get it working, I am unable to read/parse the file like I was before. I usually get ValueError: Extra data errors.
Code below:
import os
import json
import getpass

files = []
subdirs = []
USER = getpass.getuser()
pathMac = ['/Applications/',
           '/Users/' + USER + '/Documents/']

def create_dir_index(path):
    files = []
    subdirs = []
    for root, dirs, filenames in os.walk(path):
        for subdir in dirs:
            subdirs.append(os.path.relpath(os.path.join(root, subdir), path))
        for f in filenames:
            files.append(os.path.relpath(os.path.join(root, f), path))
    return dict(files=files, subdirs=subdirs)

def create_state():
    for count in xrange(len(pathMac)):
        dir_state = create_dir_index(pathMac[count])
        out_file = open("Manifest.json", "w")
        json.dump(dir_state, out_file)
        out_file.close()

def compare_states(dir_base, dir_cmp):
    '''
    return a comparison of two manifest json files
    '''
    data = {}
    data['deleted'] = list(set(dir_cmp['files']) - set(dir_base['files']))
    data['created'] = list(set(dir_base['files']) - set(dir_cmp['files']))
    data['deleted_dirs'] = list(set(dir_cmp['subdirs']) - set(dir_base['subdirs']))
    data['created_dirs'] = list(set(dir_base['subdirs']) - set(dir_cmp['subdirs']))
    return data

if __name__ == '__main__':
    response = raw_input("Would you like to Compare or Create? ")
    if response == "Create":
        # CREATE MANIFEST json file
        create_state()
        print "Manifest file created."
    elif response == "Compare":
        # create the CURRENT state of all indexes in pathMac and write to json file
        for count in xrange(len(pathMac)):
            dir_state = create_dir_index(pathMac[count])
            out_file = open("CurrentState.json", "w")
            json.dump(dir_state, out_file)
            out_file.close()
        # Open and Load the contents from the file into dictionaries
        manifest = json.load(open("Manifest.json", "r"))
        current = json.load(open("CurrentState.json", "r"))
        print compare_states(current, manifest)
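One likely cause of both symptoms: opening "Manifest.json" with mode "w" on every pass of the loop truncates the file, so only the last path's index survives, and dumping several JSON documents back to back into one file is exactly what raises ValueError: Extra data on load. A hedged sketch of one fix, assuming you collect all indexes into a single dict keyed by path and dump it once (reusing pathMac and create_dir_index from above):

import json

def create_state():
    # build one JSON document mapping each watched path to its index
    state = {}
    for path in pathMac:
        state[path] = create_dir_index(path)
    with open("Manifest.json", "w") as out_file:
        json.dump(state, out_file)

compare_states would then be called per path, e.g. compare_states(current[p], manifest[p]).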
I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader

BASEDIR = ''
PDFFiles = []

def extractor():
    output = open('windoutput.txt', 'r+')
    for file in PDFFiles:
        try:
            pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
            pdf_info = pdf_toread.getDocumentInfo()
            #print str(pdf_info)  # print full metadata if you want
            x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
            print x
            output.write(x + '\n')
        except:
            x = file + '~' + ' ERROR: Data missing or corrupt'
            print x
            output.write(x + '\n')
            pass
    output.close()

if __name__ == "__main__":
    extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os

def extractor():
    basedir = os.getcwd()
    extension = '.pdf'
    pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
    with open('pdfmetadata.csv', 'wb') as csvfile:
        for f in pdffiles:
            try:
                pdf_to_read = PdfFileReader(open(f, 'r'))
                pdf_info = pdf_to_read.getDocumentInfo()
                title = pdf_info['/Title']
                subject = pdf_info['/Subject']
                csvfile.writerow([file, title, subject])
                print 'Metadata for %s written successfully.' % (f)
            except:
                print 'ERROR reading file %s.' % (f)
                #output.writerow(x + '\n')
                pass

if __name__ == "__main__":
    extractor()
In its current state it seems to just print a single error message (that is, the error message from my exception handler, not an error raised by Python) and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
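Putting those fixes together, a minimal sketch (assuming Python 2, matching the question's print statements; note that writerow belongs to a csv.writer wrapped around the file, not to the file object itself):

import csv
import os
import sys
from pyPdf import PdfFileReader

def extractor():
    # flat list of .pdf names in the current directory
    pdffiles = [x for x in os.listdir(os.getcwd()) if x.endswith('.pdf')]
    with open('pdfmetadata.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile)  # writerow lives on the writer, not the file
        for f in pdffiles:
            try:
                pdf_info = PdfFileReader(open(f, 'rb')).getDocumentInfo()
                writer.writerow([f, pdf_info['/Title'], pdf_info['/Subject']])
                print 'Metadata for %s written successfully.' % f
            except Exception:
                print 'ERROR reading file %s: %s' % (f, sys.exc_info()[1])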
Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
    for f in files:
        pass  # do stuff with f
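Equivalently (my suggestion, not part of the answer above), drop the enclosing brackets so pdffiles is already a flat list:

pdffiles = [x for x in os.listdir(basedir) if x.endswith('.pdf')]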
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob

basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
Figured it out. The script I used to download the files was saving them with a trailing '\r\n' after each file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.