Exporting data from Word tables to Excel using Python - python

I have thousands of word documents that have one table on the first page with information that I need outputted into Excel. So far I have this, but I'm not sure why it's not working
import win32com.client as win32
import os
myDir = r'C:\Projects\Capital'
XL = win32.Dispatch('Excel.Application')
XL.Visible = 1
XLbook = XL.Workbooks.Open(os.path.join(myDir,'Dealflow.xlsx'))
XLsheet = XLbook.Worksheets(1)
XLrow = 2
for myFile in os.listdir(myDir):
filepath = os.path.join(myDir,myFile)
filename = os.path.splitext(myFile)[0]
ext = os.path.splitext(myFile)[1]
if ext == '.docx':
word = win32.Dispatch('Word.Application')
word.Visible = 1
word.Documents.Open(filepath)
doc = word.ActiveDocument
table = doc.Tables(1)
XLsheet.Cells(XLrow,1).Value = table.Cell(Row=1, Column=1).Range.Text
XLsheet.Cells(XLrow,2).Value = table.Cell(Row=2, Column=3).Range.Text
XLrow = XLrow +1
doc.Close()

Related

Trying to extract a range of PDF page numbers from a pdf page to split and save it as a separate file using Python

I am trying to create a range of page numbers from a pdf file and then split and save them as a separate file.
Below is the code written for it.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import re
def pdf_splitter(pdf_path):
directory = "C:\\Users\\Docs\\"
fname = os.path.splitext(os.path.basename(pdf_path))[0]
print(fname)
object = PdfFileReader(pdf_path)
NumPages = object.getNumPages()
print(NumPages)
string = "Challenge 1:"
string2 = "Challenge 2:"
res=0
pageList=[]
for txt in range(0,NumPages):
pdf_writer = PdfFileWriter()
pageObject = object.getPage(txt)
Text = pageObject.extractText()
print(Text)
acc_pos = Text.find(string)
print(acc_pos)
Cur_pos = Text.find(string2)
print(Cur_pos)
loanAcctName = Text[acc_pos+12:Cur_pos]
print (loanAcctName)
# pageList.append(txt)
# print(pageList)
ReSearch = re.search(string, Text)
if ReSearch != None:
pageList.append(txt)
elif ReSearch ==None:
pageList.append(txt)
print(pageList)
res = res + 1
pdf_writer.addPage(object.getPage(page_num))
output_filename = '{}_page_{}.pdf'.format(loanAcctName,page + 1)
with open(output_filename, 'wb') as out:
pdf_writer.write(out)
print('Created: {}'.format(output_filename))
out.close()
res = res + 1
if __name__ == '__main__':
pdf_path = r"C:\Users\FY22.pdf"
pdf_splitter(pdf_path)

Is there a way to rename file names in a folder by matching an Excel file column?

Example would be I have a file PDF and it's just John Smith. In the Excel file I have John Smith and in another column John Smith's Tax ID number. How do I get John Smith 5555555? Would like to rename all file names from matching the name in an Excel file.def rename_file(file_to_rename, source_file):
def rename_file(file_to_rename, source_file):
p = Path('C:\Users\Chris\Box\Capital\Fund, 2 L.P\Tax')
filename = p.stem
wb = xlrd.open_workbook('C:\Users\Chris\Box\Capital\Fund, 2 L.P\Tax\Fund 2 LP - Document Uploader ID 2021')
sheet = wb.sheet_by_index(0)
for row_num in range(sheet.nrows):
row_value = sheet.row_values(row_num)
col = 4 # 'john smith' col number
if row_value[col] == filename:
new_filename = f'{row_value[col]}_{row_value[col - 1]}' # format as you want
p.rename(Path(p.parent, new_filename + p.suffix)) # rename
break
def get_paths_in_directory(directory):
return Path(directory).glob('*.pdf')
if __name__ == "__main__":
source_file = "Fund 2 LP - Document Uploader ID 2021.xlsx" # excel file to get new filename
source_directory = "C:\Users\Chris\Box\Capital\Fund, 2 L.P\Tax\2021 Final" # directory where your files to rename are.
# iterate all pdf files in the given directory
paths = get_paths_in_directory(source_directory)
for file_to_rename in paths:
rename_file(str(file_to_rename), source_file)
Something like this will work. (You need to pip install the packages)
import xlrd
from pathlib import Path
def rename_file(file_to_rename, source_file):
p = Path(file_to_rename)
filename = p.stem
wb = xlrd.open_workbook(source_file)
sheet = wb.sheet_by_index(0)
for row_num in range(sheet.nrows):
row_value = sheet.row_values(row_num)
col = 0 # 'john smith' col number
if row_value[col] == filename:
new_filename = f'{row_value[col]}_{row_value[col + 1]}' # format as you want
p.rename(Path(p.parent, new_filename + p.suffix)) # rename
break
if __name__ == "__main__":
# add iteration to rename multiple files
file_to_rename = "john.pdf"
source_file = "new_names.xlsx"
rename_file(file_to_rename, source_file)
You can add iteration if you want to rename multiple files.
Hope this helps!
--
Edit
Since you asked for more details, I'll add a simple iteration example.
import xlrd
from pathlib import Path
def rename_file(file_to_rename, source_file):
p = Path(file_to_rename)
filename = p.stem
wb = xlrd.open_workbook(source_file)
sheet = wb.sheet_by_index(0)
for row_num in range(sheet.nrows):
row_value = sheet.row_values(row_num)
col = 0 # 'john smith' col number
if row_value[col] == filename:
new_filename = f'{row_value[col]}_{row_value[col + 1]}' # format as you want
p.rename(Path(p.parent, new_filename + p.suffix)) # rename
break
def get_paths_in_directory(directory):
return Path(directory).glob('*.pdf')
if __name__ == "__main__":
source_file = "new_names.xlsx" # excel file to get new filename
source_directory = "files_to_rename" # directory where your files to rename are.
# iterate all pdf files in the given directory
paths = get_paths_in_directory(source_directory)
for file_to_rename in paths:
rename_file(str(file_to_rename), source_file)
And, of course, create your script file (e.g. 'rename_files.py'), copy and paste this code, and run it by python3 rename_files.py
let me know if you got more questions.

extracting text from multiple pdf files from a folder in python

I am trying to extract text from multiple pdf files which will serve as the knowledge base for a closed domain chatbot. I used this code
import pandas as pd
import PyPDF2
import glob
pdf_dir = "C:/Users/Arush/OneDrive/Desktop/sample"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
output_data = pd.DataFrame(index = [0], columns = ['FileName','Text'])
fileIndex = 0
for file in pdf_files:
pdfFileObj = open(file,'rb') #'rb' for read binary mode
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
startPage = 0
text = ''
cleanText = ''
while startPage <= pdfReader.numPages -1:
pageObj = pdfReader.getPage(startPage)
text += pageObj.extractText()
startPage += 1
pdfFileObj.close()
for myWord in text:
if myWord != '\n':
cleanText += myWord
text = cleanText.split()
newRow = pd.DataFrame(index = [0], columns = ['FileName', 'Text'])
newRow.iloc[0]['FileName'] = file
newRow.iloc[0]['Text'] = text
output_data = pd.concat([output_data, newRow], ignore_index=True)
and getting data in symbols only
FileName Text
0 NaN NaN
1 C:/Users/Arush/OneDrive/Desktop/sample\Introdu... [Andreas, C.Müller, &, Sarah, Guido˜˚˛˝˙ˆˇ˘˛˙...
2 C:/Users/Arush/OneDrive/Desktop/sample\Machine... [áâáâÞ;áâáâÞ;;áâáâáâáâç...
moreover I think it only fetching 1 page
can you guy please help me ?

How can I iterate through excel files sheets and insert formula in Python?

I get this error
TypeError: 'Workbook' object is not subscriptable
when i run this code
import xlsxwriter
from openpyxl import load_workbook
in_folder = r'xxx' #Input folder
out_folder = r'xxx' #Output folder
if not os.path.exists(out_folder):
os.makedirs(out_folder)
file_exist = False
dir_list = os.listdir(in_folder)
for xlfile in dir_list:
if xlfile.endswith('.xlsx') or xlfile.endswith('.xls'):
file_exist = True
str_file = os.path.join(in_folder, xlfile)
work_book = xlsxwriter.Workbook(filename=str_file)
work_sheet = work_book['test1'] #error above is thrown here
work_sheet.write_formula('C2', '=A2+B2') #Add formular but not sure of how to apply it to the entire column.
out_Path = os.path.join(out_folder,work_book)
Edit:
I managed to figure out the above and using this code:-
work_book = openpyxl.load_workbook(os.path.join(in_folder,xlfile))
work_sheet = work_book['test1']
However, the issue formulas still exists in the new code below:-
from openpyxl import load_workbook
in_folder = r'xxx' #Input folder
out_folder = r'xxx' #Output folder
if not os.path.exists(out_folder):
os.makedirs(out_folder)
file_exist = False
dir_list = os.listdir(in_folder)
for xlfile in dir_list:
if xlfile.endswith('.xlsx') or xlfile.endswith('.xls'):
str_file = xlfile
work_book = openpyxl.load_workbook(os.path.join(in_folder,str_file))
work_sheet = work_book['Sheet1']
row_count = work_sheet.max_row
for row in work_sheet.iter_rows(min_row=1, min_col=1, max_row=work_sheet.max_row):
print(row_count)
for i, cellObj in enumerate(work_sheet['U'], 2):
cellObj.value = f'=Q{row_count}-T{row_count}'
work_book.save(os.path.join(out_folder, xlfile))
Ideally, I would like to loop through a folder with .xlsx files, add a formular and apply it to the entire column (U). In this case, I would like to save the files(with the formula effected) in another folder(out_folder).
Documentation for xlsxwriter.Workbook shows
work_book.get_worksheet_by_name('test1')
Maybe openpyxl or other module could use ['test1']

How to enumerate some text files using python?

I want to make a program to automate excel task using Openpyxl. I am using "enumerate" to open some text files and then auto input to excel file.
import os
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
for i, filename in enumerate(filenames):
file = open(filename, 'r')
text = file.read().replace('\u2014', '-').replace('—', '-')
start = 0
startcheck = True
end = 0
endcheck = True
for idx, letter in enumerate(text):
if text[idx:idx+4] == 'NPWP' and startcheck:
start = idx + 7
startcheck = False
if text[idx:idx+7] == 'Pembeli' and endcheck:
end = idx
endcheck = False
data = text[start:end]
from openpyxl import load_workbook
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
wb.save(filename = r'F:\tes\Form 1771.xlsx')
I've tried to open 2 text files but it only open and input 1 text file to excel. How do I code it to open multiple text files?
Relocate some of the statements and ensure statements are in the appropriate loops (correct indentation). I have not tested this but it looks correct.
First all imports and setup at the beginning.
import os
from openpyxl import load_workbook
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
Then start the iteration.
# outer loop
for i, filename in enumerate(filenames):
file = open(filename, 'r')
text = file.read().replace('\u2014', '-').replace('—', '-')
# explicitly close the file
file.close()
start = 0
startcheck = True
end = 0
endcheck = True
# find the data
# inner loop
for idx, letter in enumerate(text):
if text[idx:idx+4] == 'NPWP' and startcheck:
start = idx + 7
startcheck = False
if text[idx:idx+7] == 'Pembeli' and endcheck:
end = idx
endcheck = False
# ensure this section in the outer loop
data = text[start:end]
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
Finally save the workbook. Indentation ensures it is saved after all data has been written to it.
wb.save(filename = r'F:\tes\Form 1771.xlsx')
It is probably best to open a file using the with keyword which will ensure that the file is closed.
with open(filename, 'r') as f:
text = f.read().replace('\u2014', '-').replace('—', '-')
In your example you iterate over each character in the file using enumerate to find the index of the start and end of your data, text[idx:idx+4] == 'NPWP'.
strings have a find method that will do that for you.
start = text.find('NPWP')
end = text.find('Pembeli', start)
data = text[start:end]
With these changes your code would look like this:
import os
from openpyxl import load_workbook
os.chdir(r'F:\tes')
filenames = ["eb.txt", "ea.txt"]
wb = load_workbook(filename = r'F:\tes\Book1.xlsx')
for i, filename in enumerate(filenames):
with open(filename, 'r') as f:
text = f.read().replace('\u2014', '-').replace('—', '-')
start = text.find('NPWP')
end = text.find('Pembeli', start)
data = text[start:end]
sheet_ranges = wb['1771 III']
cell_name = 'M' + str(20 + (3*i))
sheet_ranges[cell_name] = data
wb.save(filename = r'F:\tes\Form 1771.xlsx')

Categories