How to speed up this search script? - python

Hello,
I have written a Python script that fills in an Excel file (wb) based on the first column of that file, which holds about 4,000 references. For each reference (read from wb and looped over in a for loop), the script searches two other Excel files loaded as DataFrames (df_mbom and df_ebom) and fills the relevant cells of wb depending on whether the reference is present in df_mbom and df_ebom. When a reference is found, the script compares its level with that of the following row and fills wb accordingly. The script works and does the job very well.
The only problem is that it takes more than 6 hours to search and fill wb for 1,000 references, so processing all 4,000 would take almost 24 hours! Do you have any suggestions to speed up this program?
Here is the code used:
import time
import pandas as pd
import openpyxl
from multiprocessing.dummy import Pool

def finding_complete(elt):
    elt = str(elt)
    pos = mylist_ref.index(elt)
    print(pos)
    item = r'^' + elt + '$'
    df_findings = df_mbom[df_mbom['Article'].str.contains(item, case=True, regex=True)]
    if df_findings.shape[0] == 0:
        active_sheet.cell(row=4 + pos, column=19).value = "NOK"
        active_sheet.cell(row=4 + pos, column=18).value = "NOK"
    else:
        active_sheet.cell(row=4 + pos, column=19).value = "OK"
        boolean_f = df_findings.drop_duplicates(subset=['Article'], keep='first')
        ind = boolean_f.index.to_list()
        idx = ind[0]
        item1 = df_mbom['Niveau'][idx]
        item2 = df_mbom['Niveau'][idx + 1]
        if item2 > item1:
            active_sheet.cell(row=4 + pos, column=18).value = "OK"
        else:
            active_sheet.cell(row=4 + pos, column=18).value = "NOK"
    df_findings2 = df_ebom[df_ebom['Article'].str.contains(item, case=True, regex=True)]
    pos = mylist_ref.index(elt)
    if df_findings2.shape[0] == 0:
        active_sheet.cell(row=4 + pos, column=17).value = "NOK"
    else:
        boolean_f = df_findings2.drop_duplicates(subset=['Article'], keep='first')
        ind = boolean_f.index.to_list()
        idx = ind[0]
        item1 = df_ebom['Niveau'][idx]
        item2 = df_ebom['Niveau'][idx + 1]
        if item2 > item1:
            active_sheet.cell(row=4 + pos, column=17).value = "OK"
        else:
            active_sheet.cell(row=4 + pos, column=17).value = "NOK"

if __name__ == '__main__':
    start = time.time()
    path = '100446099_mbom.xlsx'
    df_mbom = pd.read_excel(path, sheet_name=0, header=0)
    path = '100446099_ebom.xlsx'
    df_ebom = pd.read_excel(path, sheet_name=0, header=0)
    location = 'DOC#6TERNORrev0.xlsx'
    wb = openpyxl.load_workbook(filename=location)  # , data_only=True
    active_sheet = wb["DOC#6 toutes regions"]
    # Get each cell value from column B and put it in a list
    mylist_ref = []
    for row in active_sheet.iter_rows(min_row=4, max_row=active_sheet.max_row, min_col=2, max_col=2):
        for cell in row:
            if cell.value is not None:
                mylist_ref.append(cell.value)
    print("Number of references :")
    print(len(mylist_ref))
    print(" ")
    with Pool() as pool:  # defaults to os.cpu_count() threads
        pool.map(finding_complete, mylist_ref)  # i.e. for elt in mylist_ref: finding_complete(elt)
    wb.save(location)
    wb.close()
    final = time.time()
    timer = final - start
    print(round(timer, 1))
Thanks in advance for your time.

Convert the Excel file to JSON, process the JSON, then write it back to Excel.
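In that same spirit (load everything once, process plain in-memory Python structures, write back in one pass), here is a minimal sketch. It reuses the file names and the 'Article'/'Niveau' column names from the question; everything else is an assumption. The expensive parts of the original are the per-reference regex scan of the whole DataFrame and the repeated mylist_ref.index() calls, and threads won't help there because that work is GIL-bound; a dict lookup per reference removes almost all of it.

import openpyxl
import pandas as pd

df_mbom = pd.read_excel('100446099_mbom.xlsx', sheet_name=0, header=0)
df_ebom = pd.read_excel('100446099_ebom.xlsx', sheet_name=0, header=0)

def first_index(df):
    # Map each article to the positional index of its first occurrence,
    # mirroring drop_duplicates(keep='first') in the original script.
    idx = {}
    for i, article in enumerate(df['Article'].astype(str)):
        idx.setdefault(article, i)
    return idx

def has_children(df, i):
    # "OK" when the next row sits one level deeper than the reference itself.
    niveaux = df['Niveau'].tolist()
    return i + 1 < len(niveaux) and niveaux[i + 1] > niveaux[i]

mbom_idx = first_index(df_mbom)
ebom_idx = first_index(df_ebom)

location = 'DOC#6TERNORrev0.xlsx'
wb = openpyxl.load_workbook(filename=location)
ws = wb["DOC#6 toutes regions"]
refs = [row[0].value for row in ws.iter_rows(min_row=4, min_col=2, max_col=2)
        if row[0].value is not None]

for pos, ref in enumerate(refs):
    ref = str(ref)
    if ref in mbom_idx:
        ws.cell(row=4 + pos, column=19).value = "OK"
        ws.cell(row=4 + pos, column=18).value = "OK" if has_children(df_mbom, mbom_idx[ref]) else "NOK"
    else:
        ws.cell(row=4 + pos, column=19).value = "NOK"
        ws.cell(row=4 + pos, column=18).value = "NOK"
    if ref in ebom_idx:
        ws.cell(row=4 + pos, column=17).value = "OK" if has_children(df_ebom, ebom_idx[ref]) else "NOK"
    else:
        ws.cell(row=4 + pos, column=17).value = "NOK"

wb.save(location)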

Related

How to pick values for specific times in a date (large list of date, time, value)

I have a file with these columns: date, time, and value of a stock; basically, the per-minute value of stocks. I would like to calculate the difference in the value of a stock at 10 AM and 4 PM. This is the code I have so far:
fileName = "C:\\...\\US_200901_210907.csv"
with open(fileName) as f:
    for line in f.readlines()[1:]:
        split = line.split(";")
        time = split[3]
        date = split[2]
        for timev in f.readlines()[1:]:
            if timev == '100000':
                Spot = float(split[2])
            elif timev == '160000':
                Close = float(split[2])
Diff = Spot - Close
print(Diff)
I am not sure if I am doing this right. But the code needs to cycle/loop through each date first, find the value of the stock at '100000' and '160000' and then calculate the difference between the two. Then move to the next day. And at the end of all days, print the differences for each day.
The "Diff = Spot - Close" line also gives me an error, says "NameError: name 'Spot' is not defined"
Any help is appreciated.
Dataset looks like this (extract):
====================
After working more on this on my own, I was able to get this to work:
import csv

filename = "C:\\...\\US_200901_210907.csv"
with open(filename, 'r') as f:
    reader = csv.reader(f, delimiter=';')
    next(reader, None)  # skip header
    rows = list(reader)

listOfDates = []
index = 0
for row in rows:
    if rows[index][2] not in listOfDates:
        listOfDates.append(rows[index][2])
    index = index + 1
print(listOfDates)

startPrice = 0
endPrice = 0
index = 0
startPriceSet = False
endPriceSet = False
for date in listOfDates:
    for row in rows:
        # guard so index can't run past the end of rows
        if index < len(rows) and rows[index][2] == date:
            # print(rows[index][2])
            # print(date)
            if rows[index][3] == '100000':
                startPrice = float(rows[index][7])
                startPriceSet = True
            elif rows[index][3] == '160000':
                endPrice = float(rows[index][7])
                endPriceSet = True
            index = index + 1
    if startPriceSet and endPriceSet:
        print(date, startPrice, endPrice, startPrice - endPrice)
        startPriceSet = False
        endPriceSet = False
Why not leverage a pandas DataFrame for this calculation:
import pandas as pd

df = pd.read_csv("C:\\...\\US_200901_210907.csv")
# give appropriate column names before or after loading the data
# assuming we have the columns 'time', 'date' & 'stockvalue' in df
# might have to use pandas.to_datetime
# note: pandas boolean masks combine with &, not &&
print(df[(df['time'] == 'time1') & (df['date'] == 'date1')]['stockvalue']
      - df[(df['time'] == 'time2') & (df['date'] == 'date1')]['stockvalue'])
Also, why do you have an embedded for loop?
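Extending that idea to every date at once with a pivot, as a hedged sketch: it assumes, like the answer above, columns named 'date', 'time' and 'stockvalue' with 'time' read as a string; adjust the names to the real header.

import pandas as pd

# Keep only the 10:00 and 16:00 rows, then pivot to one row per date.
df = pd.read_csv("US_200901_210907.csv", sep=';', dtype={'time': str})
pivot = (df[df['time'].isin(['100000', '160000'])]
         .pivot_table(index='date', columns='time', values='stockvalue'))
print(pivot['100000'] - pivot['160000'])  # one difference per trading day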
One approach, with the sheet you have provided:
import pandas as pd
from collections import defaultdict

df = pd.read_excel("Data.xlsx", header=None, dtype='str')
out = defaultdict(lambda: defaultdict(float))
for rowindex, row in df.iterrows():
    date = row[2]
    name = row[0]
    if row[3] == "100000":
        out[name]['DATE'] = row[2]
        out[name]['START'] = float(row[4])
    if row[3] == "160000":
        out[name]['END'] = float(row[4])
for stock, data in out.items():
    # START and END are floats, so they need str() before concatenation
    print(stock + ': DATE: ' + data['DATE'] + ' START: ' + str(data['START'])
          + ' END: ' + str(data['END']) + ' diff = ' + str(int(data['END'] - data['START'])))

How to put docx content in dataframe columns?

Below is my code:
import os
import pandas as pd
from docx import Document

def get_files(extension, location):
    v_doc = []
    for root, dirs, files in os.walk(location):
        for t in files:
            if t.endswith(extension):
                v_doc.append(t)
    return v_doc

file_list = get_files('.docx', paths)
# print(file_list)
index = 0
for file in file_list:
    index += 1
    doc = Document(file)
    # print(doc)
    column_label = f'column{index}'
    data_content = doc.paragraphs
    final = []
    for f in data_content:
        final.append(f.text)
    new = [x for x in final if x]
    # j = {column_label: new}
    # print(j)
    df_last = pd.DataFrame(new, columns=[column_label])
    df_last.to_excel('output_dummy.xlsx')
But I get the following problem:
column2:
#hello how are you guys?
#i hope you are all doing fine
Expected dataframe output:
column1:                                       column2:
#This column is getting replaced by column 2   #hello how are you guys?
#some random dummy text                        #i hope you are all doing fine
docx1 contains:
#This column is getting replaced by column 2
#some random dummy text
docx2 contains:
#hello how are you guys?
#i hope you are all doing fine
I know it's a silly question. Where am I making this mistake?
I found the answer.
Repeat f'column{index}' for the .docx and Excel files too, as f'column{index + index2}'.
# index2 counts the docx or excel files, like the previous index.
for file2 in file_list2:
    file2 = 'datas/' + file2
    index2 += 1
    column_label2 = f'seller{index2}'
    df = pd.read_excel(file2, header=None, index_col=False)
    for l in df.values:
        for s in l:
            g.append(s)
    t = [incom for incom in g if str(incom) != 'nan']
    for s in t:
        final.append({column_label2: s})

index = 0
for file in file_list:
    file = 'datas/' + file
    index += 1
    doc = Document(file)
    column_label = f'seller{index + index2}'
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                new_list = [p.text for p in cell.paragraphs
                            if p.text not in ['5', '3', '0.1%', '1%', '1', 'Bill', 'Number']]
                for s in new_list:
                    final.append({column_label: s})
    y = [d.text for d in doc.paragraphs if d.text not in ['5', '3', '0.1%', '1%', '1', 'Number']]
    for k in y:
        final.append({column_label: k})
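For the original question, a minimal sketch of the underlying fix (an assumption about the intent, not the poster's exact code): the symptom described above comes from rebuilding df_last inside the loop, so only the last file's column survives. Collect one list per file and build the DataFrame once; pandas pads columns of unequal length with NaN.

import os
import pandas as pd
from docx import Document

def get_files(extension, location):
    # Walk the folder and return full paths of matching files.
    found = []
    for root, dirs, files in os.walk(location):
        for name in files:
            if name.endswith(extension):
                found.append(os.path.join(root, name))
    return found

columns = {}
for index, file in enumerate(get_files('.docx', 'datas'), start=1):  # 'datas' folder as in the answer
    doc = Document(file)
    texts = [p.text for p in doc.paragraphs if p.text]
    columns[f'column{index}'] = pd.Series(texts)  # Series so unequal lengths align

df = pd.DataFrame(columns)  # one column per docx, built once
df.to_excel('output_dummy.xlsx', index=False)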

Remove empty rows based on ",,,,,,," values in the first column out of appended csv's (pandas)

I want to pass a DataFrame that is built from several joined CSVs to another method.
col0 = ["Dov"]
values = A, B
But some CSVs contain only ",,,,,,,," of varying length in col0 at the end, 20 to 50 rows' worth. The length of these strange values is the same within a file but differs from file to file. The rest of the columns are only filled in the regular rows.
I've played with loc, drop and dropna, struggled for a while and searched Stack Overflow. Still no clue.
def dataImport():
    header_appended = False
    concat_i = []
    progress = len(linkGenerator())
    count = 1
    print("0 of " + str(progress))
    for i in list(linkGenerator()):
        if header_appended != True:
            print("Appending : " + str(i))
            df_raw = pd.read_csv(i, header=0, sep=",", engine="python", encoding="ISO-8859-1", index_col=False)
            header_appended = True
            print("Appended.")
            time.sleep(2)
        else:
            print("Appending : " + str(i))
            df_internal = pd.read_csv(i, sep=",", engine="python", encoding="ISO-8859-1", index_col=False)
            concat_i.append(df_internal)
            print("Appended.")
            time.sleep(2)
        print(str(count) + " of " + str(progress))
        count = count + 1
    df_raw = pd.concat(concat_i, ignore_index=True)
    return df_raw
Works so far. Only the part after df_raw = pd.concat(concat_i, ignore_index = True) is my problem.
The code below could do this:
df_raw = df_raw[~df_raw["Div"].str.match(",,,,,,,,,,,,,,,,,,,,,,,,,,,", na=False)]
Hope this helps. The str category of functions is pretty powerful within pandas. I don't know what the minimum number of "," is that you are seeing; also, col0 needs to be str/object type.

Thanks for trying to help me. Sadly it did not work. Other attempts:
# df_raw = df_raw.drop(df_raw["Div"] != 0, inplace=True)
# df_raw = df_raw.drop(["Div"] == "")
# df_raw = df_raw.loc[df_raw["Div"] != ""]
# df_raw = df_raw[(df_raw["Div"] != ",,,,,,") | (df_raw["Div"] != ",,,,,,,,,,,,,,,,,,,,,,,,,,,,")].copy()
Result every time:
Start
End
Got it! The solution was:
df_raw.dropna(subset=["Div"], inplace=True)
Rows consisting only of commas parse as NaN in every column, so dropping rows where "Div" is NaN removes them.

Extra Comma in specific data field using pandas

I am combining very large data sets using Python. The script works completely fine. However, one specific field may or may not have a comma inside it. Does anyone know how to remove the comma? FYI, this is how the data is collected; it cannot be removed at collection time. The field in question is the ["NAME"] field.
I have tried to implement a sep=r',(?!\s)' lookahead and that screws my data up even more.
Thanks!
import csv
import shutil
import os
import pandas as pd
from os import path

def combinecsv(source_folder):
    all_files = os.listdir(source_folder)
    master_df = None
    for anyfile in all_files:
        if anyfile.lower().endswith(".csv"):
            file_path = path.join(source_folder, anyfile)
            print("opening file path: {}".format(file_path))
            df = pd.read_csv(file_path)
            if master_df is None:
                master_df = df
            else:
                master_df = master_df.append(df)
    new_df = pd.DataFrame()
    new_df["MSG_TYPE"] = master_df["MSG_TYPE"]
    new_df["MMSI"] = master_df["MMSI"]
    new_df["NAME"] = master_df.apply(lambda row: check_for_none(row["NAME"]), axis=1)
    new_df["LAT_AVG"] = master_df["LAT_AVG"]
    new_df["LON_AVG"] = master_df["LON_AVG"]
    new_df["PERIOD"] = master_df.apply(lambda row: convert_period(row["PERIOD"]), axis=1)
    new_df["SPEED_KNOTS"] = master_df.apply(lambda row: check_for_none(row["SPEED_KNOTS"]), axis=1)
    new_df["COG_DEG"] = master_df.apply(lambda row: check_for_none(row["COG_DEG"]), axis=1)
    new_df["SHIP_AND_CARGO_TYPE"] = master_df.apply(lambda row: check_for_none(row["SHIP_AND_CARGO_TYPE"]), axis=1)
    new_df["DRAUGHT"] = master_df.apply(lambda row: check_for_none(row["DRAUGHT"]), axis=1)
    new_df["LEN"] = master_df.apply(lambda row: combine_bowstern(row["DIM_BOW"], row["DIM_STERN"]), axis=1)
    # axis=1 traverses rows, not columns
    new_folder = path.join(source_folder, "output")
    if not path.exists(new_folder):
        os.mkdir(new_folder)
    new_csvpath = path.join(new_folder, "output.csv")
    print("saving csv to {}".format(new_csvpath))
    new_df.to_csv(new_csvpath, index=False, quoting=csv.QUOTE_NONNUMERIC)

def check_for_none(value):
    # value is a single cell, not a DataFrame
    if value == 'None':
        return ""
    else:
        return value

def convert_period(period):
    y = str(period[2:4])
    m = str(period[5:7])
    d = str(period[8:10])
    t = str(period[11:16])
    periodnewformat = "{}/{}/{} {}".format(d, m, y, t)
    return periodnewformat

def combine_bowstern(bow, stern):
    bow_int = 0
    stern_int = 0
    if bow != "None":
        bow_int = int(bow)
    if stern != "None":
        stern_int = int(stern)
    return bow_int + stern_int

if __name__ == "__main__":
    source_folder = r'C:\Users\MTTA Standalone\Desktop\Code\csvcombine'
    combinecsv(source_folder)
Here is a sample of the data set with and without the extra comma:
MSG_TYPE,MMSI,NAME,IMO_NUMBER,CALL_SIGN,LAT_AVG,LON_AVG,PERIOD,SPEED_KNOTS,COG_DEG,HEADING_DEG,NAV_STATUS,NAV_SENSOR,SHIP_AND_CARGO_TYPE,DRAUGHT,DIM_BOW,DIM_STERN,DIM_PORT,DIM_STARBOARD,MMSI_COUNTRY_CD,RECEIVER
1,249830000,ZIM LUANDA,9403229,9HA2029,37.825850,-74.340755,2018-08-01 00:00:00.000,11.5,196.4,198,0,1,71,10.9,197,63,21,11,MT,D05MN-HR-CHIBS1
1,256819000,IOLCOS, DESTINY,9486049,9HA2936,36.833089,-75.672449,2018-08-01 00:00:00.000,9.7,93.1,95,0,1,70,14.4,199,30,13,24,MT,D05MN-NC-MAMBS1
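One possible pre-processing approach, as a hedged sketch (an assumption, not part of the original script): the header fixes the expected number of columns, so any data row with one extra field must have an unquoted comma inside NAME; merging that field back repairs the row before pandas sees it.

import csv
import pandas as pd

def read_with_name_commas(path):
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    n_cols = len(rows[0])             # expected column count, from the header
    name_pos = rows[0].index('NAME')  # NAME is the field that can split
    fixed = []
    for row in rows:
        while len(row) > n_cols:
            # Re-join the split NAME pieces with the comma they lost.
            row[name_pos] = row[name_pos] + ',' + row[name_pos + 1]
            del row[name_pos + 1]
        fixed.append(row)
    return pd.DataFrame(fixed[1:], columns=fixed[0])

df = read_with_name_commas('sample.csv')  # hypothetical file name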

Multi-threading list iterating for loop

This function reads from a text file, re-formats the contents, and then writes them to a CSV. I'm trying to use threading to parallelize the for i in lines loop; this is the longest part of a larger script and takes up most of the run time, because the list lines contains thousands of elements. Can someone help me straighten this out? Doing this synchronously instead of in parallel takes tons of time. I have seen many answers to similar questions, but so far I haven't understood them well enough to implement them correctly.
import os
import shutil
from datetime import datetime
from multiprocessing.dummy import Pool as ThreadPool

import pandas as pd

# leagues2, final_stats_path and root come from the larger script
def sheets(i):
    # time format for spreadsheet
    dt_time = datetime.now().strftime('%m/%d|%H:%M')
    # for league name (NFL, NBA, NHL etc.) in list containing league names
    for league_name in leagues2:
        league_name = league_name.split('|')[0]
        with open(final_stats_path, 'r+') as lines:
            lines = lines.readlines()
        # i = one long string containing details about the event in the loop,
        # e.g. sport, game day, game id, home team name
        for i in lines:
            i = i.split(',')
            minprice = i[6]
            totaltix = i[5]
            event_date = i[2]
            try:
                dayofweek = datetime.strptime(event_date, '%Y-%m-%d').strftime('%A')
            except:
                continue
            event_date = i[2][2:]
            event_date = str(event_date).split('-')
            event_date = event_date[1] + '/' + event_date[2]
            sport = i[4]
            event = i[1].replace('Basketball', '').replace('\n', '')
            away = i[8].replace('Basketball', '').replace('\n', '')
            eventid = i[0]
            event_home = i[9].replace('Basketball', '').replace('\n', '')
            event = event.split(' at ')[0]
            tixdata = str(totaltix)
            eventid = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=' + str(eventid) + '&sectionId=0'
            directory = root + '\data' + '\\' + sport + '\\'
            report = directory + 'report.xlsx'
            fname = directory + 'teams.txt'
            eventleague = sport
            f = open(directory + 'acronym.txt', 'r+')
            lines_2 = f.readlines()
            for qt in lines_2:
                qt = qt.split('-')
                compare = qt[1]
                if event_home in compare:
                    event_home = qt[0]
                else:
                    pass
            troop = []
            d = {
                'ID': eventid,
                'Date': event_date,
                'Day': dayofweek,
                'Away': away,
            }
            s = {
                'time': tixdata
            }
            numbers = event_home + '.txt'
            numbers_new = 'bk\\bk_' + numbers
            with open(directory + numbers_new, 'a+') as y:
                pass
            with open(directory + numbers, 'a+') as o:
                pass
            with open(directory + numbers, 'r+') as g:
                for row in g:
                    if str(eventid) in row:
                        # print('the event is in the list')
                        row_update = row.replace('}', ", '" + dt_time + "': '" + tixdata + "'}")
                        with open(directory + numbers_new, 'a+') as y:
                            y.write(row_update)
                        break
                    else:
                        with open(directory + numbers, 'a+') as p:
                            # print('the event is not in the list')
                            p.write(str(d) + '\n')
                        with open(directory + numbers_new, 'a+') as n:
                            n.write(str(d) + '\n')
            sizefile = os.path.getsize(directory + numbers_new)
            if sizefile > 0:
                shutil.copy(directory + numbers_new, directory + numbers)
                open(directory + numbers_new, 'w').close()
            else:
                pass
            df = []
            with open(directory + numbers, 'r+') as t:
                for row in t:
                    b = eval(row)
                    dfs = df.append(b)
            df = pd.DataFrame(df)
            yark = list(df.columns)[:-5]
            zed = ['ID', 'Date', 'Day', 'Away']
            columns = zed + yark
            try:
                df = df[columns]
            except:
                pass
            df.index = range(1, 2 * len(df) + 1, 2)
            df = df.reindex(index=range(2 * len(df)))
            writer = pd.ExcelWriter(directory + event_home + '.xlsx', engine='xlsxwriter')
            try:
                df.to_excel(writer, sheet_name=event_home)
            except:
                continue
            workbook = writer.book
            worksheet = writer.sheets[event_home]
            format1 = workbook.add_format({'num_format': '#,##0.00'})
            worksheet.set_column('A:ZZ', 18, format1)
            writer.save()

if __name__ == "__main__":
    pool = ThreadPool(8)  # make the pool of workers
    results = pool.map(sheets)  # open the urls in their own threads
    pool.close()  # close the pool and wait for the work to finish
    pool.join()
    # get_numbers()
    # stats_to_csv()
    # stats_to_html()
    # sheets()
Try changing the following line:
results = pool.map(sheets)
to:
results = pool.map(sheets,range(8))
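Note that sheets(i) never actually uses its argument; it re-reads the whole file itself, so pool.map(sheets, range(8)) would run the entire job eight times rather than splitting it. A hedged sketch of the usual pattern (an assumption about the intent; final_stats_path and the per-line work are placeholders): make the worker handle one line, then map the pool over the lines themselves.

from multiprocessing.dummy import Pool as ThreadPool

final_stats_path = 'final_stats.txt'  # placeholder; use the real path

def process_line(line):
    # ... the per-event formatting from the body of the loop goes here ...
    return line.split(',')[0]  # placeholder result

if __name__ == "__main__":
    with open(final_stats_path) as f:
        lines = f.readlines()
    with ThreadPool(8) as pool:
        results = pool.map(process_line, lines)  # one line per task, in parallel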
