PyPDF2 gives me blank pages in the merged PDF - Python

I asked an earlier question here:
pypdf2-merging-pdf-pages-issue
Since then I have come a long way: I can now create my PDF files from an Excel document via pandas into PyPDF2, and I now know how many pages each PDF must have.
However, my problem now is that my merged PDF files come out blank.
If I debug, I can see that in my second loop the variable "paths" holds the right paths to my physical PDF files.
But when they then go through:
with path.open('rb') as pdf:
    pdf_writer.append(pdf)
an extra backslash suddenly appears in the paths, so a path that should be named c:\users\... is shown as c:\\users\\...
I do not know if this is what prevents the files from being opened and read correctly and then merged into one PDF file.
I hope someone can guide me, as my Python is self-taught, or can explain in some other way why the merged PDF files I create come out blank across 3 pages.
My code is:
import datetime                 #Handle date
import pandas as pd             #Handle data from Excel Sheet (Data analysis)
import PyPDF2 as pdf2           #Handle PDF read and merging
from pathlib import Path        #Handle path

#Skip ERROR-message: Xref table not zero-indexed. ID numbers for objects will be corrected.
#import sys
#if not sys.warnoptions:
#    import warnings
#    warnings.simplefilter("ignore")

PDF_PATH = Path('C:/Users/TH/PDF/')
EXCEL_FILENAME = 'Resources/liste.xlsx'

def main():
    today = datetime.date.today()            # The date now
    next_week = today.isocalendar()[1] + 1   # 0=Year, 1=week
    resources = pd.read_excel(EXCEL_FILENAME, sheet_name='Ark1')
    for row in resources.itertuples():
        year = row.Aargang
        paths = [
            (PDF_PATH / row.Oevelse1).with_suffix('.pdf'),
            (PDF_PATH / row.Oevelse2).with_suffix('.pdf'),
            (PDF_PATH / row.Oevelse3).with_suffix('.pdf'),
        ]
        pdf_writer = pdf2.PdfFileMerger()
        for path in paths:
            with path.open('rb') as pdf:
                pdf_writer.append(pdf)
        with open(f'Uge {next_week} - {year} Merged_doc.pdf', 'wb') as output:
            pdf_writer.write(output)

if __name__ == '__main__':
    main()

Thanks #anon01, and thanks/credit to Sirius3.
It came down to how PyPDF2's PdfFileMerger is used, together with some quirks/bugs in the library itself.
After editing the code to the following, it works:
import datetime                   #Handle date
import pandas as pd               #Handle data from Excel Sheet (Data analysis)
from PyPDF2 import PdfFileMerger  #Handle PDF read and merging
from pathlib import Path          #Handle path

#Skip ERROR-message: Xref table not zero-indexed. ID numbers for objects will be corrected.
#import sys
#if not sys.warnoptions:
#    import warnings
#    warnings.simplefilter("ignore")

PDF_PATH = Path('C:/Users/TH/PDF')
EXCEL_FILENAME = 'Resources/liste.xlsx'

def main():
    today = datetime.date.today()            # The date now
    next_week = today.isocalendar()[1] + 1   # 0=Year, 1=week
    resources = pd.read_excel(EXCEL_FILENAME, sheet_name='Ark1')
    for row in resources.itertuples():
        year = row.Aargang
        paths = [
            (PDF_PATH / row.Oevelse1).with_suffix('.pdf'),
            (PDF_PATH / row.Oevelse2).with_suffix('.pdf'),
            (PDF_PATH / row.Oevelse3).with_suffix('.pdf'),
        ]
        pdf_merger = PdfFileMerger()
        for path in paths:
            pdf_merger.append(str(path))     # pass the path, not a file object
        with open(f'Uge {next_week} - {year} Merged_doc.pdf', 'wb') as output:
            pdf_merger.write(output)
        pdf_merger.close()

if __name__ == '__main__':
    main()
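For what it's worth, a likely explanation for the blank pages in the first version (this is my own hedged sketch, not something confirmed in the thread): PdfFileMerger only reads the page contents of the appended sources when write() is called, so closing each source file at the end of its with block leaves nothing to read. Passing path strings (as above) avoids that, and so does keeping all handles open until after write():

from contextlib import ExitStack
from PyPDF2 import PdfFileMerger

def merge_pdfs(paths, out_name):
    # Sketch only: keep every source PDF open until the merged file is written.
    merger = PdfFileMerger()
    with ExitStack() as stack:
        for path in paths:
            # enter_context keeps each handle open until the ExitStack exits
            merger.append(stack.enter_context(path.open('rb')))
        with open(out_name, 'wb') as output:
            merger.write(output)   # the source files are still open here
    merger.close()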

Related

While obtaining file hashes, some folders and files from the directory are not showing up

My code was working just fine before I added the hash function. I was getting the list of all folders and files in my directory in the PrettyTable. Once I added the hash function, only maybe 5 of the files in that directory showed up in the table with hashes. I am not sure where I have gone wrong. Please forgive me, I am new to this. We are not learning to code from scratch, but have to modify existing code to function the way we need it to.
# Python Standard Libraries
import os        # file system methods
import hashlib   # hashing functions
import sys       # system methods
import time      # time conversions

# Python 3rd Party Libraries
from prettytable import PrettyTable  # pip install prettytable

# Local Functions
def GetFileMetaData(fileName):
    # obtain file system metadata
    try:
        metaData = os.stat(fileName)        # Use the stat method to obtain meta data
        fileSize = metaData.st_size         # Extract fileSize and MAC Times
        timeLastAccess = metaData.st_atime
        timeLastModified = metaData.st_mtime
        timeCreated = metaData.st_ctime
        macTimeList = [timeLastModified, timeCreated, timeLastAccess]  # Group the MAC Times in a List
        return True, None, fileSize, macTimeList
    except Exception as err:
        return False, str(err), None, None

# Pseudo Constants

# Start of the Script
tbl = PrettyTable(['FilePath', 'FileSize', 'UTC-Modified', 'UTC-Accessed', 'UTC-Created', 'SHA-256 HASH'])

# file check
while True:
    targetFolder = input("Enter Target Folder: ")
    if os.path.isdir(targetFolder):
        break
    else:
        print("\nInvalid Folder ... Please Try Again")

print("Walking: ", targetFolder, "\n")
print()

for currentRoot, dirList, fileList in os.walk(targetFolder):
    for nextFile in fileList:
        fullPath = os.path.join(currentRoot, nextFile)
        absPath = os.path.abspath(fullPath)
        fileSize = os.path.getsize(absPath)
        success, errInfo, fileSize, macList = GetFileMetaData(absPath)
        if success:
            # convert to readable Greenwich Time
            modTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[0]))
            accTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[1]))
            creTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[2]))
            # hashing function
            with open(absPath, 'rb') as target:
                fileContents = target.read()
            sha256Obj = hashlib.sha256()
            sha256Obj.update(fileContents)
            hexDigest = sha256Obj.hexdigest()
            tbl.add_row([absPath, fileSize, modTime, accTime, creTime, hexDigest])

tbl.align = "l"  # align the columns left justified

# display the table
print(tbl.get_string(sortby="FileSize", reversesort=True))
print("\nScript-End\n")
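One thing I would check here (my own hedged guess, not something stated in the original post): if open() or read() raises an exception on any file, for example a locked or permission-protected one, the script stops before the remaining files are added to the table. A per-file try/except around the hash step keeps the walk going, and reading in chunks avoids pulling huge files into memory. A minimal sketch, assuming the same hashlib/PrettyTable setup as above:

import hashlib

def HashFile(filePath, blockSize=65536):
    # Sketch only: hash one file, returning (ok, hexdigest-or-error-message)
    try:
        sha256Obj = hashlib.sha256()
        with open(filePath, 'rb') as target:
            block = target.read(blockSize)
            while block:
                sha256Obj.update(block)          # hash the file chunk by chunk
                block = target.read(blockSize)
        return True, sha256Obj.hexdigest()
    except OSError as err:                       # unreadable file: report it and move on
        return False, str(err)

# inside the walk loop, something like:
#     hashOk, hexDigest = HashFile(absPath)
#     if hashOk:
#         tbl.add_row([absPath, fileSize, modTime, accTime, creTime, hexDigest])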

How to print a chart from an Excel Chartsheet using Python

I'm completely new to coding (it's just for fun and hopefully to save some time at work), and I've been trying to make my first lines of code work.
Specifically, I want my code to open a certain Excel workbook, find certain worksheets that are actually chartsheets (each one containing a single chart) and print them as pdf/jpeg files in a specific folder. I went for ExportAsFixedFormat, but I encountered the following error:
AttributeError: 'Chartsheet' object has no attribute 'ExportAsFixedFormat'
Could you please help me? Is there any way to print/save a Chartsheet?
I went through the Chartsheet Object's methods, but I couldn't find anything helpful. I'm sure I'm missing something.
Some info about my configuration:
Windows 10 Home x64
Excel for Microsoft 365 MSO (16.0.13628.20318) 64 bit
Python 3.8 32 bit
Pywin32 version 227
Below is the chunk of code that I'm having problems with.
[Edit]: below is the whole code I wrote; maybe the error is not where I think it is.
Thank you in advance and sorry for my broken English.
First of all, I've imported a ton of things; I'm aware I most probably need just half of them.
import plotly
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import win32com.client as win32
import openpyxl
import os, sys
import math
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl import chart
from openpyxl import chartsheet
from openpyxl.chartsheet.publish import WebPublishItem, WebPublishItems
from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
#from .drawings import find_images
from openpyxl.chartsheet import Chartsheet
import openpyxl.chart
import win32com.client
from pdf2image import convert_from_path
from pathlib import Path
import xlsxwriter
And here is the code I wrote:
path_filePy = Path(__file__).resolve()
current_folder = path_filePy.parent
image_folder_name = "Immages"
image_folder_path = os.path.join(current_folder, image_folder_name)
try:
    os.mkdir(image_folder_path)
except OSError:
    files = os.listdir(image_folder_path)
    for f in files:
        os.remove(image_folder_path + '\\' + f)
folder_list = os.listdir(current_folder)
excel_list = []
for l in folder_list:
    if l.endswith('.xlsx'):
        excel_list.append(l)
chartsheets_names = ['Chartsheet1', 'Chartsheet2', 'Chartsheet3', 'Chartsheet4']
excel = win32.gencache.EnsureDispatch('Excel.Application')
for excelfile in excel_list:
    wb = load_workbook(os.path.join(current_folder, excelfile))
    for sheet in chartsheets_names:
        ws = wb[sheet]
        image_file_name = excelfile[:-5] + '_' + sheet + '.pdf'
        image_file_path = os.path.join(image_folder_path, image_file_name)
        ws.ExportAsFixedFormat(0, image_file_path)
        convert_from_path(image_file_path, dpi=300, output_folder=image_folder_path, fmt='jpeg')
    wb.Close()
I managed to get what I wanted in the end. Below is the code I'm using now; maybe it will be helpful to someone else too.
I think I was mixing up code related to win32com with code related to openpyxl.
Now I would like my chartsheets to stretch over the whole printing area before printing (I tried setting the margins to zero, but it does not work). I think I should use wb_sheet.PageSetup.ChartSize with the value FullPage, but I do not get how to assign it (see the sketch after the code below).
import os
import sys
from pathlib import Path
import win32com.client as w3c
from pdf2image import convert_from_path

# find the parent folder of the .py file
path_filePy = Path(__file__).resolve()
current_folder = path_filePy.parent
print(current_folder)

# create the destination folder, or empty it if it already exists
image_folder_name = "Immages"
image_folder_path = os.path.join(current_folder, image_folder_name)
#print(image_folder_path)
try:
    os.mkdir(image_folder_path)
except OSError:
    files = os.listdir(image_folder_path)
    for f in files:
        os.remove(image_folder_path + '\\' + f)

# list of files in the folder
folder_list = os.listdir(current_folder)

# list of only *.xlsx files
excel_list = []
for l in folder_list:
    if l.endswith('.xlsx'):
        excel_list.append(l)

# list of the sheets' names I want to print
chartsheets_names = ['Sheet1', 'Sheet2', 'Sheet3', 'Sheet4']

o = w3c.Dispatch("Excel.Application")
o.Visible = False

# for each sheet named in my list, in each xlsx file, print both a pdf and a jpeg
for excel_file in excel_list:
    try:
        wb_path = os.path.join(os.path.abspath(current_folder), excel_file)
        wb = o.Workbooks.Open(wb_path)
        for wb_sheet in wb.Sheets:
            if wb_sheet.Name in chartsheets_names:
                path_to_pdf = os.path.join(os.path.abspath(image_folder_path), excel_file[:-5] + ' - ' + str(wb_sheet.Name) + '.pdf')
                wb_sheet.SaveAs(path_to_pdf, FileFormat=57)
                convert_from_path(
                    path_to_pdf,                 # the input pdf file
                    dpi=300,
                    output_folder=image_folder_path,
                    fmt='jpeg',
                    output_file=str(excel_file[:-5] + ' - ' + str(wb_sheet.Name)),
                    poppler_path=r"C:\where\your\poppler\bin folder is",
                    use_pdftocairo=False)
            else:
                next
        wb.Close(False)
    except OSError:
        next
o.Quit
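Regarding the ChartSize question above, here is a minimal, untested sketch of how I would expect the assignment to look with win32com (xlFullPage is the Excel XlObjectSize constant, which I believe has the value 1, and the property applies to chart sheets only); treat it as an assumption rather than a verified answer:

xlFullPage = 1   # Excel XlObjectSize constant (assumed value)

for wb_sheet in wb.Sheets:
    if wb_sheet.Name in chartsheets_names:
        wb_sheet.PageSetup.ChartSize = xlFullPage   # stretch the chart over the full printed page
        path_to_pdf = os.path.join(os.path.abspath(image_folder_path),
                                   excel_file[:-5] + ' - ' + str(wb_sheet.Name) + '.pdf')
        wb_sheet.SaveAs(path_to_pdf, FileFormat=57)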
See updated answer further down.
See further update below.
After pip install comtypes this works for me:
import os
import comtypes.client
SOURCE_DIR = r'C:\Users\xyz\SO-samples' # adjust to your needs
TARGET_DIR = r'C:\Users\xyz\SO-samples' # adjust to your needs
app = comtypes.client.CreateObject('Excel.Application')
app.Visible = False
infile = os.path.join(os.path.abspath(SOURCE_DIR), 'an-excel-file.xlsx')
outfile = os.path.join(os.path.abspath(TARGET_DIR), 'an-excel-file.pdf')
doc = app.Workbooks.Open(infile)
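# (in the call below, 0 = xlTypePDF; as far as I can tell, the trailing 1, 0 are the Quality and IncludeDocProperties arguments)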
doc.ExportAsFixedFormat(0, outfile, 1, 0)
doc.Close()
app.Quit()
Updated answer - selectable sheets:
import os
import win32com.client

SOURCE_DIR = r'C:\Users\xyz\SO-samples' # adjust
TARGET_DIR = r'C:\Users\xyz\SO-samples' # adjust

o = win32com.client.Dispatch("Excel.Application")
o.Visible = False
wb_path = os.path.join(os.path.abspath(SOURCE_DIR), 'xyzzy.xlsx')
wb = o.Workbooks.Open(wb_path)

# print 1 sheet to 1 file
path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy2.pdf')
ws_index_list = [2]  # say you want to print this sheet
wb.WorkSheets(ws_index_list).Select()
wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

# print 2 sheets to 1 file
path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-1-3.pdf')
ws_index_list = [1, 3]  # say you want to print these sheets
wb.WorkSheets(ws_index_list).Select()
wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

# print 3 sheets to 1 file each
ws_index_list = [1, 2, 3]  # say you want to print these sheets
for ws_index in ws_index_list:
    path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-' + str(ws_index) + '.pdf')
    wb.WorkSheets([ws_index]).Select()
    wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

# select sheet by name, print 1 sheet to 1 file
ws_sheet_name = 'named_sheet'
path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-' + ws_sheet_name + '.pdf')
wb.WorkSheets(ws_sheet_name).Select()
wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)
Further update - printing sheet names, select sheet by name:
import win32com.client as w3c
import os, sys

SOURCE_DIR = r'C:\Users\xyz\SO-samples'
TARGET_DIR = r'C:\Users\xyz\SO-samples'

wb_path = os.path.join(os.path.abspath(SOURCE_DIR), 'xyzzy.xlsx')
o = w3c.Dispatch("Excel.Application")
o.Visible = False
wb = o.Workbooks.Open(wb_path)

for wb_sheet in wb.Sheets:
    print(wb_sheet.Name)

### this works
ws_sheet_name = [1, 3]
path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy' + '.pdf')
wb.Worksheets(ws_sheet_name).Select()
wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

### this works
ws_sheet_name = 'xyzzy'
path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-xyzzy' + '.pdf')
wb.Worksheets(ws_sheet_name).Select()
wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

wb.Close()

Loop through multiple CSV files and run a script

I have a script which pulls in data from a csv file, does some manipulations to it and creates an output excel file. But it's a tedious process, as I need to do it for multiple files.
Question: Is there a way for me to run this script across multiple csv files together and create a separate excel file output for each input file?
I'm not sure what to try out here. I've read that I need to use a module called glob but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import xlsxwriter

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'

# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)

# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                  'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                  ascending=False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                         cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                          cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                  cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                 cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                           'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                           'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]

# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before, and I would appreciate your help in trying to figure this out.
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import xlsxwriter
import glob
import os

def process_csv(input_filename, output_filename):
    # Get data
    df = pd.read_csv(input_filename)

    # Clean data
    cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                      'Orders','Sales(INR)','NTB orders','NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                      ascending=False).reset_index()
    cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                             cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                              cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                     cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                               'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                               'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]

    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

    # Push to excel
    writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'

for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
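For example, instead of concatenating the '//'-separated strings from the question, the paths could be built like this (the folder names are just the ones from the question):

import os

INPUT_PATH = os.path.join('SystemPath', 'Downloads')             # SystemPath/Downloads
OUTPUT_PATH = os.path.join('SystemPath', 'Downloads', 'Output')  # SystemPath/Downloads/Output
input_file = os.path.join(INPUT_PATH, 'rawData.csv')             # uses the right separator on any OS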
Something like:
import os
import glob
import pandas as pd

os.chdir(r'path\to\folder')      # changes folder path to working dir
filelist = glob.glob('*.csv')    # creates a list of all csv files
for file in filelist:            # loops through the files
    df = pd.read_csv(file, ...)
    # Do something and create a final_df
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)  # excel with same name + output
You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    if file.endswith('.csv') or file.endswith('.CSV'):
        INPUT_FILE = INPUT_PATH + '/' + file
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
Try this:
import glob

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)
    # Clean data
    # ...your cleaning code...
    # Push to excel
    writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv", "_OUTPUT.xlsx"),
                            engine='xlsxwriter')

Reading multiple Excel files and writing the output to multiple Excel files in Python

I have written code that reads an Excel file and, after running the required processing function, writes the result to another Excel file. I have done this for one Excel file. My question is: when I want to do this for multiple Excel files, that is, read multiple Excel files and also produce multiple output Excel files, how do I apply a for loop here so that I get a separate output Excel file for each input file?
Following is my code:
from ParallelP import *
import time, json
import pandas as pd

if __name__ == '__main__':
    __ip__ = "ip/"
    __op__ = "op/"
    __excel_file_name__ = __ip__ + '80chars.xlsx'
    __prediction_op__ = __op__ + basename(__excel_file_name__) + "_processed.xlsx"
    df = pd.read_excel(__excel_file_name__)
    start_time = time.time()
    df_preprocessed = run(df)
    print("Time Needed to execute all data is {0} seconds".format((time.time() - start_time)))
    print("Done...")
    df_preprocessed.to_excel(__prediction_op__)
I tried to stick to your example and just expand it as I would do it. The below example is untested and does not mean that it is the best way to do it!
from ParallelP import *
import time, json
import pandas as pd
import os
from pathlib import Path  # Handles directory paths -> less error prone than manually sticking paths together

if __name__ == '__main__':
    __ip__ = "ip/"
    __op__ = "op/"

    # Get a list of all excel files in a given directory
    excel_file_list = [f for f in os.listdir(__ip__) if f.endswith('.xlsx')]

    # Loop over the list and process each excel file separately
    for excel_file in excel_file_list:
        excel_file_path = Path(__ip__, excel_file)      # Create the file path
        df = pd.read_excel(str(excel_file_path))        # Read the excel file into a data frame
        start_time = time.time()
        df_preprocessed = run(df)                       # Run your routine
        print("Time Needed to execute all data is {0} seconds".format((time.time() - start_time)))
        print("Done...")

        # Create the output file name
        prediction_output_file_name = '{}__processed.xlsx'.format(str(excel_file_path.resolve().stem))
        # Create the output file path
        prediction_output_file_path = str(Path(__op__, prediction_output_file_name))
        # Write the output to the excel file
        df_preprocessed.to_excel(prediction_output_file_path)
Sidenote: I have to mention that your variable names feel like a misuse of the double underscore. These 'dunder' names are special and indicate that they have a meaning for Python (see for example here). Please just name your variables input_dir and output_dir instead of __ip__ and __op__, respectively.
I do have some code I wrote. Maybe you can alter it for your requirements.
import os

# This is where your input files should be
in_folder = 'input/xls/file/folder'
# This will be your output folder
out_folder = 'output/xls/file/folder'

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

file_exist = False
dir_list = os.listdir(in_folder)
for xlfile in dir_list:
    if xlfile.endswith('.xlsx') or xlfile.endswith('.xls'):
        file_exist = True
        str_file = os.path.join(in_folder, xlfile)
        #work_book = load_workbook(filename=str_file)
        #work_sheet = work_book['qa']
        # Do your work with the excel file here
        #out_Path = os.path.join(out_folder,)
        # and output it to out_Path
if not file_exist:
    print('cannot find any valid excel file in the folder ' + in_folder)
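As an illustration only (the run() routine and the _processed suffix are borrowed from the question; the read/write calls are my assumptions), the commented placeholder section above might be filled in like this:

from ParallelP import run
import pandas as pd

# inside the loop, after str_file has been built:
df = pd.read_excel(str_file)                     # read one input workbook
df_processed = run(df)                           # the question's processing routine
out_path = os.path.join(out_folder, os.path.splitext(xlfile)[0] + '_processed.xlsx')
df_processed.to_excel(out_path)                  # one output workbook per input workbook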

Finding text files with specific matching numbers or strings in the filename

I have imported a file with the column header 'Head serial #'.
This is a list of serial numbers:
Head serial #
UG0013
UG0025
UG0043
UG0053
UG5214
UG5246
UT5324
UT0244
TH7035
TH7106
TH7212
TH7218
TH7362
C499277BT433
D499241BD221
D499227BQ004
B500438BZ921
B500425BZ933
I need to find all the text files in a network folder that have these numbers in the filename. Please help!
Here is my code so far, which currently returns ALL .txt files, but I only want the files with the above serial numbers in the name. Thanks in advance!
import matplotlib.pyplot as plot
import pandas as pd
import xlrd

""" This is the master file for reading the lifetest lasers """
masterfile_location = 'C:/Users/gallachj/Documents/Lifetest_Master.xlsx'
#df = pd.read_excel(masterfile_location)

from pandas import ExcelWriter
from pandas import ExcelFile

df = pd.read_excel(masterfile_location, sheet_name='Sheet1')
#print("Column headings:")
#print(df.columns)
#print(df['Head serial #'])

sns = df['Head serial #']
headtypes = df['Head type']
colors = df['Wavelength (nm)']
powers = df['Power rating (W)']

import fnmatch
import os

os.chdir('C:/Users/gallachj/Documents/')
for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*.txt'):
        # if fnmatch.filter(sns, '*.txt')
        print(file)
Based on this answer from Noufal Ibrahim, you can try something like this:
import os

def find_filenames(d, s):
    try:
        files = os.listdir(d)
    except PermissionError:  # network drives usually have a lot of inaccessible folders
        return []
    matched_files = []
    for f in files:  # loop over the elements in the folder
        full_name = os.path.join(d, f)  # get the full relative name
        if os.path.isdir(full_name):  # recursive call
            matched_files += find_filenames(full_name, s)
        elif os.path.isfile(full_name):
            if any(serial in f for serial in s):  # check the filename
                matched_files.append(os.path.realpath(full_name))  # remember the matched file
    return matched_files  # return a list of matched files

find_filenames(r'\\40i2039p-0\d\_Mer_', ['search', 'words'])
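As a usage note (my own alternative sketch, not part of the original answer), the same search can also be written on top of os.walk, which skips unreadable folders on its own; the UNC path in the comment below is just a hypothetical placeholder:

import os

def find_filenames_walk(root, serials):
    # Walk the tree and keep every file whose name contains one of the serial numbers
    matches = []
    for current_root, dir_list, file_list in os.walk(root):
        for name in file_list:
            if any(serial in name for serial in serials):
                matches.append(os.path.join(current_root, name))
    return matches

# e.g. find_filenames_walk(r'\\server\share\folder', list(sns)) with sns from the question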
