How to put docx content in dataframe columns? - python

Below is my code:
if t.endswith('.docx'):
def get_files(extension, location):
v_doc = []
for root, dirs, files in os.walk(location):
for t in files:
if t.endswith(extension):
v_doc.append(t)
return v_doc
file_list = get_files('.docx', paths)
#print(file_list)
index = 0
for file in file_list:
index += 1
doc = Document(file)
#print(doc)
column_label = f'column{index}'
data_content = doc.paragraphs
final = []
for f in data_content:
final.append(f.text)
new = [x for x in final if x]
#j = {column_label: new}
#print(j)
df_last = pd.DataFrame(new, columns=
[column_label])
df_last.to_excel('output_dummy.xlsx')
But i get following problem:
column2:
#hello how are you guys?
#i hope you are all doing fine
expected dataframe output:
column1: column2:
#This column is getting replaced by column 2 #hello how are you guys?
#some random dummy text #i hope you are all doing fine
docx1 contans:
#This column is getting replaced by column 2
#some random dummy text
docx2 conatins:
#hello how are you guys?
#i hope you are all doing fine
i know its a silly question. where am i doing this mistake ?

I found the answer.
Repeat f'column{index}' also for .doc and .excel to
f'column{index+index2}'.
#index2 is for docx or excel like previous one.
for file2 in file_list2:
file2 = 'datas/'+file2
index2 += 1
column_label2 = f'seller{index2}'
df = pd.read_excel(file2, header=None, index_col=False)
for l in df.values:
for s in l:
g.append(s)
t = [incom for incom in g if str(incom) != 'nan']
for s in t:
final.append({column_label2: s})
index = 0
for file in file_list:
file = 'datas/'+file
index += 1
doc = Document(file)
column_label = f'seller{index+index2}'
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
new_list = [p.text for p in cell.paragraphs if p.text not in ['5','3','0.1%', '1%','1',
'Bill','Number' ]]
for s in new_list:
final.append({column_label: s})
y = [d.text for d in doc.paragraphs if d.text not in ['5','3','0.1%', '1%', '1',
'Number']]
for k in y:
final.append({column_label: k})

Related

Sorting and coloring excel file via python

import pandas as pd
from fuzzywuzzy import fuzz
import openpyxl
from itertools import combinations
from difflib import SequenceMatcher
import json
def readExcel():
df = pd.read_excel('tedarikci.xlsx')
# Tüm isimleri bir Python listesine ata
isimler = df['Müşteri/tedarikçi ismi'].tolist() #names
# output_list = []
# İsimleri karşılaştır
def searchInDict(dataDict,item):
if item in dataDict.keys():
return False
else:
for parent in dataDict.keys():
if item in dataDict[parent]:
return False
return True
benzerIsimler = {} #similarNames
print(isimler) #names
counter = 0
for i,item in enumerate(isimler):
print(i)
if searchInDict(benzerIsimler,item):
print(item)
benzerIsimler[item] = []
for j,item2 in enumerate(isimler):
if item != item2:
score = SequenceMatcher(None, item, item2).ratio()
if score > 0.9:
benzerIsimler[item].append(item2)
for i in benzerIsimler.keys():
print(benzerIsimler[i])
with open("dataMusteri.json", "w") as outfile:
json.dump(benzerIsimler, outfile)
def printExcel():
benzerIsimler = None
with open("dataMusteri.json", "r") as outfile:
benzerIsimler = json.load(outfile)
dataframe = openpyxl.load_workbook("tedarikci.xlsx")
# Define variable to read sheet
dataframe1 = dataframe.active
sortedRows = []
for item in benzerIsimler.keys():
print(item)
for row in dataframe1.iter_rows():
if row[0].value == item :
sortedRows.append(row)
break
for item2 in benzerIsimler[item]:
for row in dataframe1.iter_rows():
if row[0].value == item :
sortedRows.append(row)
break
print(sortedRows)
wb = openpyxl.Workbook()
ws = wb.active
ws.append(["Müşteri/tedarikçi ismi","E-posta","Müşteri/tedarikçi grubu","TL Bakiye ", "USD Bakiye", "EUR Bakiye", "GBP Bakiye", "Adres", "İl", "İlçe", "Telefon", "Faks", "Vergi dairesi", "Vergi numarası/TC kimlik no"])
for row in sortedRows:
row_with_values = [cell.value for cell in row]
ws.append(row_with_values)
wb.save("dilara.xlsx")
printExcel()
I am trying to sort the similar names on an excel file and after sorting I want to color the similar/same ones with the same color.
The code above is what I've done so far. I can sort but I can't paint the same/similar rows, my code paints all rows with the same color. Do you have any suggestion?

How to speed up this search script?

IHello,
I have created a python script which aims to complete an excel file (wb) thanks to the first column of this file composed of many references (about 4000). To complete this excel, my script must search each reference (so use a for loop of list references from reading wb file) in two other excel files transformed into dataframe (df_mbom and df_ebom) and fill the specific cells of wb according to the presence or not of the references in df_mbom and df_ebom. If the reference is found, it is necessary to compare the level of the reference and the following line and fill wb accordingly. The created script works very well and it does the job very well.
But the only problem I have is that it takes more than 6 hours to search and fill wb for 1000 references so to process the 4000 references, it would take almost 24 hours! Do you have any suggestions to speed up this program?
Here is the code used:
from multiprocessing.dummy import Pool
def finding_complete(elt):
elt = str(elt)
pos = mylist_ref.index(elt)
print(pos)
item = r'^' + elt + '$'
df_findings = df_mbom[df_mbom['Article'].str.contains(item, case=True, regex=True)]
if df_findings.shape[0] == 0 :
active_sheet.cell(row = 4+pos, column = 19).value = "NOK"
active_sheet.cell(row = 4+pos, column = 18).value = "NOK"
else :
active_sheet.cell(row = 4+pos, column = 19).value = "OK"
boolean_f = df_findings.drop_duplicates(subset = ['Article'],keep = 'first')
ind = boolean_f.index.to_list()
idx = ind[0]
item1 = df_mbom['Niveau'][idx]
item2 = df_mbom['Niveau'][idx + 1]
if item2 > item1 :
active_sheet.cell(row = 4+pos, column = 18).value = "OK"
else :
active_sheet.cell(row = 4+pos, column = 18).value = "NOK"
df_findings2 = df_ebom[df_ebom['Article'].str.contains(item, case=True, regex=True)]
pos = mylist_ref.index(elt)
if df_findings2.shape[0] == 0 :
active_sheet.cell(row = 4+pos, column = 17).value = "NOK"
else :
boolean_f = df_findings2.drop_duplicates(subset = ['Article'],keep = 'first')
ind = boolean_f.index.to_list()
idx = ind[0]
item1 = df_ebom['Niveau'][idx]
item2 = df_ebom['Niveau'][idx + 1]
if item2 > item1 :
active_sheet.cell(row = 4+pos, column = 17).value = "OK"
else :
active_sheet.cell(row = 4+pos, column = 17).value = "NOK"
if __name__ == '__main__':
start = time.time()
path = '100446099_mbom.xlsx'
df_mbom = pd.read_excel(path, sheet_name=0, header=0)
path = '100446099_ebom.xlsx'
df_ebom = pd.read_excel(path, sheet_name=0, header=0)
location = 'DOC#6TERNORrev0.xlsx'
wb = openpyxl.load_workbook(filename=location) #, data_only=True"
active_sheet = wb["DOC#6 toutes regions"]
#Get cell value and put it in a list
mylist_ref = []
for row in active_sheet.iter_rows(min_row=4, max_row=active_sheet.max_row, min_col=2, max_col=2):
for cell in row:
if cell.value == None :
pass
else:
mylist_ref.append(cell.value)
print("Number of references :")
print(len(mylist_ref))
print(" ")
with Pool() as pool: #os.cpu_count())
pool.map(finding_complete,mylist_ref) # correspond à for elt in mylist_ref: do finding_complete
wb.save(location)
wb.close()
final = time.time()
timer = final - start
print(round(timer, 1))
Thanks in advance for your time.
convert the Excel file to json, procces the json, then write it to Excel.

writing Excel file while using for loop

I am trying to write data to an Excel file, during a for loop.
But what I am getting is a single line containing the last data received by the loop.
I have tried a couple of different methods but came short..
2 tries are list below
Any Ideas ?
def write_excel(x):
workbook = xlsxwriter.Workbook('ID_Num.xlsx')
worksheet = workbook.add_worksheet()
df = pd.DataFrame(
{'ID':[x],
'mail_one':[Email],
'second_mail':[second_mail],
'Num':[Num],
'date':[Date]})
row_num = 0
for key, value in df.items():
worksheet.write(0, row_num, key)
worksheet.write_row(1, row_num, value)
row_num += 1
workbook.close()
#df = pd.DataFrame(
# {'ID':[x],
# 'mail_one':[Email],
# 'second_mail':[second_mail],
# 'Num':[Num],
# 'date':[Date]})
# writer = ExcelWriter('ID_Num.xlsx')
# df.to_excel(writer,'ID_Num',index=False)
# writer.save()
if __name__ == "__main__":
for x in List:
my_dic = {}
my_dict["ID"] = x
my_dict["mail_one"] = Email
my_dict["second_mail"] = second_mail
my_dict["Num"] = str(Num)
my_dict["date"] = Date
print(my_dict)
write_excel(x)
I don't have xlsxwriter so I cannot test. The documentation says that it cannot modify an existing file so I suspect that every iteration of for x in List: you are over-writing your file (workbook = xlsxwriter.Workbook('ID_Num.xlsx')).
You can make multiple files with these changes:
def write_excel(x,i):
workbook = xlsxwriter.Workbook(f'ID_Num{i}.xlsx')
...
# and
for i,x in enumerate(List):
...
write_excel(x,i)
Or you could accumulate multiple dictionaries and pass all of them to your function
data = []
for x in List:
my_dic = {}
...
data.append(my_dic)
write_excel(data)
Changing the function to iterate over those dicts; making a new sheet for each one
def write_excel(data):
workbook = xlsxwriter.Workbook('ID_Num.xlsx')
for sht in data:
worksheet = workbook.add_worksheet()
df = pd.DataFrame(...
row_num = 0
for key, value in df.items():
worksheet.write(...
worksheet.write_row(...
row_num += 1
workbook.close()

Problem extracting content from text files using Python

I am trying to capture the data here in the second table (Field crops) titled "Prices Received, United States,July 2010, with Comparisons". I am using Panda dataframes to capture the table from the text file and then I will output it to a CSV file.
My code is as follows
def find_no_line_start_table(table_title,splited_data):
found_no_lines = []
for index, line in enumerate(splited_data):
if table_title in line:
found_no_lines.append(index)
return found_no_lines
def get_start_data_table(table_start, splited_data):
for index, row in enumerate(splited_data[table_start:]):
if 'Dollars' in row:
return table_start + index
def get_end_table(start_table_data, splited_data ):
for index, row in enumerate(splited_data[start_table_data:]):
if END_TABLE_LINE in row:
return start_table_data + index
def row(l):
l = l.split()
number_columns = 6
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w
return data_row
def take_table(txt_data):
comodity = []
q = []
w = []
e = []
t = []
p = []
for r in table:
data_row = row(r)
if data_row:
col_1, col_2, col_3, col_4, col_5, col_6 = data_row
comodity.append(col_1)
q.append(col_2)
w.append(col_3)
e.append(col_4)
t.append(col_5)
p.append(col_6)
table_data = {'comodity': comodity, 'q': q,
'w': w, 'e': e, 't': t}
return table_data
And, then I am doing this:
import requests
import pandas as pd
txt_data = requests.get("https://downloads.usda.library.cornell.edu/usda-esmis/files/c821gj76b/6w924d00c/9z903130m/AgriPric-07-30-2010.txt").text
splited_data = txt_data.split('\n')
table_title = 'Prices Received, United States'
END_TABLE_LINE = '-------------------------------------------'
_, table_start,_ = find_no_line_start_table(table_title,splited_data)
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
dict_table = take_table(txt_data)
pd.DataFrame(dict_table)
c = pd.DataFrame(dict_table)
IndexError: list assignment index out of range
However, I am getting an error here. Can anyone help me figure out what I am doing wrong?
Cause of error:
data_row is a list of 6 elements.
number_columns = 6
# ...
data_row = [''] * number_columns # [''] * 6
and index will increment with each iteration where first_column_done = True. But first_column_done will be True when : is encountered in a word, i.e
if ':' in w:
first_column_done = True
hence, for each iteration after first_column_done turns True, index will increment until it gets more than 6 which is the bound of list data_row.
def row(l):
l = l.split()
number_columns = 6
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w # error pos.
In other words, U get this error for each line that contains a number of words greater than 6 - index after the first occurence of : within a word in that line.
Fix:
Use split(':') and list comprehension as well as python tertiary operator.
def row(l):
row = [ col.strip() for col in l.split(':') ]
row[2:] = row[2].split()
return [ row[i] if i < len(row) else '' for i in range(6) ]

Creating a histogram with .txt file using Python?

Here's the code I tried. It gave me a syntax error highlighting 'data'. Any help? The .txt file has 4 columns if that's of any help.
def file():
file = open('hsp.txt', 'r')
col = [] data = file.readlines()
for i in range(1,len(data)-1):
col.append(int(float(data[i].split(',')[5])))
return col
def hist(col):
handspan = []
for i in range(11):
handspan.append(0)
for i in (col):
handspan[i] += 1
return handspan
col = file()
handspan = hist(col)
print(col)
print(handspan)
It is because your line
col = [] data = file.readlines()
should be on two separate lines:
col = []
data = file.readlines()
You can try this, it worked for me.
Hence it is a histogram it yields a dictionary.
Better answers are welcome!
import string
def list_from_file(filename):
myfile = open(filename, 'r')
data = myfile.read().split()
col = []
for word in data:
col.append(word)
return col
def myhist(col):
hist = {}
for word in col:
word = word.lower()
word = word.strip(string.punctuation + string.whitespace)
hist[word] = hist.get(word, 0)+1
return hist
col = list_from_file('em.txt')
colf = myhist(col)
print(colf)

Categories