Issue with renaming PDFs in Python from CSV file - python

So I have this script that reads a CSV file (the file is as follows):
test0
test1
test2
test3
Then the script takes a multipage PDF file (4 pages in this example), and splits it into 4 separate pages, naming them as 'document-pages1', 'document-pages2' etc.
What I'd like it to do is name each split page file after the value in the corresponding row of the CSV file.
So a 4 page PDF would correlate to the 4 row csv file. Unfortunately, I'm at a loss for how to implement this part.
My code so far is as follows:
from PyPDF2 import PdfFileWriter, PdfFileReader
import csv
def readcsv(filename):
    """Read a semicolon-delimited CSV file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row; each entry is the list of string
        fields of that row.
    """
    # The "rU" mode was removed in Python 3.11. newline="" is the mode the
    # csv module asks for so it can handle line endings itself, and the
    # with-block closes the file even if reading fails.
    with open(filename, newline="") as ifile:
        return list(csv.reader(ifile, delimiter=";"))
# Ask where the CSV with the desired names lives and load its rows.
filepath = input("Filepath: ")
filename = input("File name: ")
csv = readcsv(filepath + filename)

# Write every page of test.pdf into its own single-page PDF file.
inputpdf = PdfFileReader(open("test.pdf", "rb"))
for page_number in range(inputpdf.numPages):
    page_writer = PdfFileWriter()
    page_writer.addPage(inputpdf.getPage(page_number))
    with open("document-page%s.pdf" % page_number, "wb") as page_file:
        page_writer.write(page_file)
Any help at all would be immensely appreciated.

ended up fixing my own issue as follows:
from PyPDF2 import PdfFileWriter, PdfFileReader
import csv
def readcsv(filename):
    """Read a semicolon-delimited CSV file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row; each entry is the list of string
        fields of that row.
    """
    # The "rU" mode was removed in Python 3.11. newline="" is the mode the
    # csv module asks for so it can handle line endings itself, and the
    # with-block closes the file even if reading fails.
    with open(filename, newline="") as ifile:
        return list(csv.reader(ifile, delimiter=";"))
filepath = input("Filepath: ")
filename = input("File name: ")
# Keep the rows under their own name: rebinding `csv` would shadow the csv
# module that readcsv() relies on.
page_names = readcsv(filepath + filename)

# Keep the input PDF open for the whole loop (in case the reader loads pages
# lazily) and make sure it is closed afterwards.
with open("test.pdf", "rb") as pdf_file:
    inputpdf = PdfFileReader(pdf_file)
    # Fail before writing anything if some page would have no name.
    if inputpdf.numPages > len(page_names):
        raise IndexError(
            "test.pdf has {} pages but the CSV only has {} rows".format(
                inputpdf.numPages, len(page_names)))
    for i in range(inputpdf.numPages):
        output = PdfFileWriter()
        output.addPage(inputpdf.getPage(i))
        # Each row is a list of fields; joining them yields the plain name
        # without having to strip "[", "]" and "'" out of the list's repr
        # (which would also strip those characters from the name itself).
        page_name = ", ".join(page_names[i])
        with open(page_name + ".pdf", "wb") as outputStream:
            output.write(outputStream)

Related

How to keep CSV headers on the chunk files after script split?

I need help modifying this script to include headers in the output chunk files. The script asks how many rows each chunk file should contain and splits the input file accordingly. The output files do not contain the header from the original file. I am seeking advice on how to implement this.
import csv
import os
import sys
# Short aliases for the library callables used below.
os_path = os.path
csv_writer = csv.writer
sys_exit = sys.exit

if __name__ == '__main__':
    # Rows per output chunk; anything that is not an integer ends the run.
    try:
        chunk_size = int(input('Input number of rows of one chunk file: '))
    except ValueError:
        print('Number of rows must be integer. Close.')
        sys_exit()

    file_path = input('Input path to .tsv file for splitting on chunks: ')
    is_tsv_file = os_path.isfile(file_path) and file_path.endswith('.tsv')
    if not is_tsv_file:
        print('You must input path to .tsv file for splitting.')
        sys_exit()

    # Chunks are named "<input path without extension>_<n>.tsv".
    file_name = os_path.splitext(file_path)[0]
    with open(file_path, 'r', newline='', encoding='utf-8') as tsv_file:
        chunk_file = None
        writer = None
        counter = 1
        reader = csv.reader(tsv_file, delimiter='\t', quotechar='\'')
        for index, chunk in enumerate(reader):
            starts_new_chunk = index % chunk_size == 0
            if starts_new_chunk:
                # Finish the previous chunk before opening the next one.
                if chunk_file is not None:
                    chunk_file.close()
                chunk_name = '{0}_{1}.tsv'.format(file_name, counter)
                chunk_file = open(
                    chunk_name, 'w', newline='', encoding='utf-8')
                counter += 1
                writer = csv_writer(
                    chunk_file, delimiter='\t', quotechar='\'')
                print('File "{}" complete.'.format(chunk_name))
            writer.writerow(chunk)
You can do it by reading the header row manually when opening the input file, and then writing it at the beginning of each output file — see the ADDED comments in the code below:
...
with open(file_path, 'r', newline='', encoding='utf-8') as tsv_file:
    chunk_file = None
    writer = None
    counter = 1
    reader = csv.reader(tsv_file, delimiter='\t', quotechar="'")
    # Read and save header row; None when the input file is empty. (ADDED)
    header = next(reader, None)
    try:
        for index, chunk in enumerate(reader):
            if index % chunk_size == 0:
                if chunk_file is not None:
                    chunk_file.close()
                chunk_name = '{0}_{1}.tsv'.format(file_name, counter)
                chunk_file = open(
                    chunk_name, 'w', newline='', encoding='utf-8')
                writer = csv_writer(
                    chunk_file, delimiter='\t', quotechar="'")
                writer.writerow(header)  # ADDED.
                print('File "{}" complete.'.format(chunk_name))
                counter += 1
            writer.writerow(chunk)
    finally:
        # The loop only closes a chunk when the next one starts, so the
        # last chunk has to be closed here (also when a write fails).
        if chunk_file is not None:
            chunk_file.close()
Note that using single-quote characters for quoting means the output files are not adhering to the CSV standard: RFC 4180

How to get number only values from a specific row from different text file

I am trying to get only numbers from a particular row from 10 different text files. As an output, I want those numbers appended as a list. I'm a new learner. I would appreciate your help.
I tried this, but it is not working:
import os
import sys,re
line_number = 69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'

for filename in os.listdir(path):
    with open(os.path.join(path, filename), 'r') as f:
        # print(filename)
        # NOTE(review): this second open() receives the bare name returned
        # by os.listdir(), so it is resolved against the current working
        # directory rather than `path`.
        file = open(filename)
        all_lines_variable = file.readlines()
        # Send print() output to the "output" file, opened in append mode.
        sys.stdout = open("output", "a")
        print(filename, all_lines_variable[line_number])
        sys.stdout.close()
You can try this script: it takes the line at index `line_number` (69) from every file in the folder and writes the numbers found on that line to output.txt, one output line per input file:
import os
import re
line_number = 69  # 0-based index into each file's lines (the 70th line)
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'

with open('output.txt', 'w') as f_out:
    for file in os.listdir(path):
        full_path = os.path.join(path, file)
        # os.listdir() also returns sub-directories, which cannot be read
        # as text files.
        if not os.path.isfile(full_path):
            continue
        wanted = None
        with open(full_path, 'r') as f_in:
            # Walk the file lazily and stop at the wanted line instead of
            # loading every line into memory.
            for index, line in enumerate(f_in):
                if index == line_number:
                    wanted = line
                    break
        if wanted is None:
            # The file is too short to contain the requested line.
            print('{}: no line at index {}; skipped'.format(
                file, line_number))
            continue
        # One output line per input file: all digit runs, space-separated.
        print(' '.join(re.findall(r'\d+', wanted)), file=f_out)

Can someone please tell me what am I doing wrong New to Python

My requirement is to look through a folder and combine only the csv files in it into one. The csv files are consistent but cannot be just concatenated. I am only interested in rows where the first element in the row is a date else I need to discard that row.
The code I have so far is below; as usual, it is not working:
import os
import csv
from dateutil.parser import parse
def datecheck(string):
    """Return True if *string* can be parsed as a date/time, else False.

    Args:
        string: Text handed to ``dateutil.parser.parse``.

    Returns:
        ``True`` when ``parse`` accepts the text, ``False`` when it raises
        ``ValueError`` or ``OverflowError``.
    """
    try:
        parse(string)
    except (ValueError, OverflowError):
        # Return an explicit bool instead of falling through to None.
        return False
    return True
file_ext = "csv"
os.chdir ("C:\\BANK_PROJECT\\FILES\\RAW_SOURCE_FILES")
file_list = os.listdir("C:\\BANK_PROJECT\\FILES\\RAW_SOURCE_FILES")

with open("outfile.csv", "w") as outfile:
    print(file_list)
    for file in file_list:
        # Only names containing ".csv" are processed.
        if ".csv" not in file:
            continue
        with open(file, 'r') as infile:
            data = csv.reader(infile)
            for row in data:
                # Skip empty rows and rows whose first field is not a date.
                if not row:
                    continue
                if not datecheck(row[0]):
                    continue
                outfilewriter = csv.writer(outfile)
                # NOTE(review): writerows() expects a sequence of rows;
                # given a single row it writes each field as its own row.
                outfilewriter.writerows(row)
import csv
from glob import glob
from dateutil.parser import parse
def datecheck(string):
    """Return True if *string* can be parsed as a date/time, else False.

    Args:
        string: Text handed to ``dateutil.parser.parse``.

    Returns:
        ``True`` when ``parse`` accepts the text, ``False`` when it raises
        ``ValueError`` or ``OverflowError``.
    """
    try:
        parse(string)
    except (ValueError, OverflowError):
        # Return an explicit bool instead of falling through to None.
        return False
    return True
# Leave out an outfile.csv left over from an earlier run so the output is
# never read back in as input.
files = [name for name in glob('*.csv') if name != 'outfile.csv']

# newline='' lets the csv module control line endings itself (this avoids
# blank lines between rows on Windows).
with open('outfile.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for file in files:
        with open(file, newline='') as infile:
            reader = csv.reader(infile)
            # Blank lines come back as empty lists; `row and ...` skips
            # them before row[0] is touched.
            data = [row for row in reader if row and datecheck(row[0])]
        writer.writerows(data)
Input file 1:
new,1,2,3
2012-01-19 17:21:0,1,2 ,3
2012-01-19,1,2,3
xx,2,3,4
Input File 2:
new,1,2,3
2012-03-19 17:21:0,1,2 ,3
yy,1,2,3
2012-04-19,1,2,3
xx,2,3,4
Output:
2012-01-19 17:21:0,1,2 ,3
2012-01-19,1,2,3
2012-03-19 17:21:0,1,2 ,3
2012-04-19,1,2,3

Split CSV files with headers in windows using python and remove text qualifiers from line start and end

I have a large CSV file that I need to split. I have
managed to split the file using the Python code below:
import csv
divisor = 500000
outfileno = 1
outfile = None

with open('file_temp.txt', 'r') as infile:
    rows = csv.reader(infile)
    for index, row in enumerate(rows):
        starts_new_file = index % divisor == 0
        if starts_new_file:
            # Finish the previous part before opening the next one.
            if outfile is not None:
                outfile.close()
            outfilename = 'big-{}.csv'.format(outfileno)
            outfile = open(outfilename, 'w')
            outfileno += 1
            writer = csv.writer(outfile)
        writer.writerow(row)
The problem I'm facing is that the file header is not copied
to the rest of the files. Can you please let me know how I can modify
my code to add the header to each of the split files?
You just need to cache the header row and then write it out for each CSV file, something like:
import csv
divisor = 500000
outfileno = 1
outfile = None
try:
    # newline='' on both files lets the csv module handle line endings
    # itself (this avoids blank lines between rows on Windows).
    with open('file_temp.txt', 'r', newline='') as infile:
        infile_iter = csv.reader(infile)
        # Cache the header row; None means the input is empty, in which
        # case the loop below has nothing to iterate over.
        header = next(infile_iter, None)
        for index, row in enumerate(infile_iter):
            if index % divisor == 0:
                if outfile is not None:
                    outfile.close()
                outfilename = 'big-{}.csv'.format(outfileno)
                outfile = open(outfilename, 'w', newline='')
                outfileno += 1
                writer = csv.writer(outfile)
                # Every part starts with the cached header.
                writer.writerow(header)
            writer.writerow(row)
finally:
    # Don't forget to close the last file
    if outfile is not None:
        outfile.close()
Since you're only working with whole lines, you don't really need the csv module (as long as no quoted field contains an embedded line break). Here's a version that works without it:
divisor = 500000
outfileno = 1
outfile = None
try:
    with open('file_temp.txt', 'r') as infile:
        # Cache the header line; None means the input is empty, in which
        # case the loop below has nothing to iterate over.
        header = next(infile, None)
        for index, row in enumerate(infile):
            if index % divisor == 0:
                if outfile is not None:
                    outfile.close()
                outfilename = 'big-{}.csv'.format(outfileno)
                outfile = open(outfilename, 'w')
                outfileno += 1
                # Every part starts with the cached header line.
                outfile.write(header)
            # Lines keep their own line ending, so they are written as-is.
            outfile.write(row)
finally:
    # Don't forget to close the last file
    if outfile is not None:
        outfile.close()

Python CSV writer - writing columns in new csv file up to maximum number of fields in csv files

I have 200 CSV files in my folder.
What I am trying to do is read the first row of each file and write it to a new CSV file.
And on top, I want to write [file,field1,field2,...fieldn]
n is maximum number of fields.
import csv
import glob
list = []
hel = []
files = glob.glob('C:/dataset/*.csv')

with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            # Keep only what follows the last backslash of the path.
            file = file[file.rfind('\\') + 1:]
            # NOTE(review): str.strip('.csv') trims any leading/trailing
            # '.', 'c', 's' and 'v' characters, not just a ".csv" suffix.
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            # Remember how many fields this file's first row has.
            hel.append(len(headers))
            max(hel)  # result is discarded
            lst = [file] + headers
            csv_writer.writerow(lst)
It came out that maximum number of fields of 200 files are 255.
So on top of new csv file, I want to write file, field1, field2 ... field 255.
How can I do this?
import csv
import glob
list = []
hel = []
files = glob.glob('C:/dataset/*.csv')

with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            # Keep only what follows the last backslash of the path.
            file = file[file.rfind('\\') + 1:]
            # NOTE(review): str.strip('.csv') trims any leading/trailing
            # '.', 'c', 's' and 'v' characters, not just a ".csv" suffix.
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            # Remember how many fields this file's first row has.
            hel.append(len(headers))
            # 'field1' .. 'fieldN' for the widest first row seen so far.
            b = ['field{}'.format(i) for i in range(1, max(hel) + 1)]
            lst = [file] + headers
            csv_writer.writerow(lst)
Now b is list that looks like this ['field1','field2'...'field255']
I need to insert 'file' before 'field1' and write that row at the top of the new CSV file. Writing it after csv_writer.writerow(lst) gives me a CSV file with 'field1','field2',... on every other line. How can I fix this problem?
You first need to read all your input files to determine the maximum number of fields is 255. Then you need to construct a list of field names to write into the output file (just once, not in a loop):
['field{}'.format(i) for i in range(1, 256)]
You can pass that list to the csv module to write it.
Read the field count and first line from each file before writing the file.
import glob
from itertools import chain
import os
from os.path import splitext, basename
def first_line(filepath):
    """Return the first line of *filepath*, including its line ending.

    An empty file yields ``""`` rather than raising ``StopIteration``; a
    ``StopIteration`` escaping from a function called by ``map()`` would
    silently end that iteration early.
    """
    with open(filepath) as f:
        return next(f, "")


def write_test_file(dest_file_path, source_path_name):
    """Write a summary of the first line of every matching source file.

    The output starts with a ``file,field1,...,fieldN`` header, where N is
    the largest number of comma-separated fields found in any first line,
    followed by one ``<file name without extension>,<first line>`` row per
    source file.

    Args:
        dest_file_path: Path of the summary file to create or overwrite.
        source_path_name: Glob pattern selecting the source files.
    """
    # Sorted so the row order does not depend on directory listing order.
    source_paths = sorted(glob.glob(source_path_name))
    # Drop each line ending here and add exactly one back when writing, so
    # a first line without a trailing newline cannot merge with the next row.
    first_lines = [first_line(p).rstrip("\n") for p in source_paths]
    # default=-1 leaves just the "file" column when nothing matched.
    max_count = max((line.count(",") for line in first_lines), default=-1)
    field_names = map("field{}".format, range(1, max_count + 2))
    header = ",".join(chain(["file"], field_names))
    file_names = (splitext(basename(p))[0] for p in source_paths)
    rows = map(",".join, zip(file_names, first_lines))
    with open(dest_file_path, 'w') as testfile:
        # Text mode already translates "\n" to the platform line ending;
        # writing os.linesep here would double the "\r" on Windows.
        testfile.writelines(line + "\n" for line in chain([header], rows))


if __name__ == "__main__":
    # Only run the export when executed as a script, not on import.
    write_test_file('test.csv', 'C:/dataset/*.csv')

Categories