Speed up pd.read_excel in Python

I'm writing Python code that reads Excel data and imports it into a database. For 10,000–30,000 records it is fine, but for 150,000+ records it takes over 13 seconds. How can I speed it up?
f = request.files['file']
all_data = {}  # insert group data
df = pd.read_excel(f)
df = df.dropna(how='all')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.index.name = excel_conf['config']['default_config']['identification_column']['name']  # column header identification
df.index += 1  # raise index to a valid one
df = df.fillna("")

########## this loop takes the time #########
for index, row in df.iterrows():
    row_dict = []
    for key in excel_conf['config']['column_config']:  # column header name list
        row_dict.append({
            key: row[key]
            # key (from excel config) : row[key] (row value from excel)
        })
    index_key = index_keygen(create_blake2s_signature(...))  # just creates the index key, shortened here
    # add child data to main dict
    all_data[index_key] = row_dict
    # "test_key" : [ { "key": "value" }, ... ]
#############################################
insert_db(all_data)  # this is fast
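One likely culprit is iterrows(), which constructs a Series object for every row. A minimal sketch of the same loop built on to_dict('records') instead, assuming the same excel_conf and the index_keygen / create_blake2s_signature helpers from the snippet above:

columns = list(excel_conf['config']['column_config'])  # the same column header names as above

all_data = {}
# to_dict('records') converts the frame to plain dicts in one pass,
# avoiding the per-row Series construction that makes iterrows() slow
for row in df[columns].to_dict('records'):
    row_dict = [{key: row[key]} for key in columns]
    index_key = index_keygen(create_blake2s_signature(...))  # unchanged, shortened as in the question
    all_data[index_key] = row_dict

insert_db(all_data)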

Related

Show excel data in only one Sheet

I'm having some doubts about the following function. I want it to write the result to a single Excel tab, but I can't get it to.
def create_df_from_table(c, tab, excelWriter):
    list_name = str(c) + "_result_list"
    list_name = []
    for i, each_row in enumerate(tab.rows):
        text = (each_cell.text for each_cell in each_row.cells)
        if i == 0:
            keys = tuple(text)
        else:
            each_dict_val = tuple(text)
            list_name.append(each_dict_val)
    list_name_copy = list_name.copy()
    result_df = pd.DataFrame(list_name)
    print(result_df)
    result_df.to_excel(excelWriter, sheet_name=str(c))
    return result_df

excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
for c, each_tab in enumerate(file.tables):
    globals()[f'result_df_{c}'] = create_df_from_table(c, each_tab, excelWriter)
excelWriter.save()
The result_df.to_excel() call inside the function above writes the dataframe to Excel, but it ends up in more than one tab, and I need all the data in a single one.
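A minimal sketch of one way to get everything onto a single tab, assuming the result_df.to_excel(...) call inside create_df_from_table is removed so the function only returns each table's DataFrame; the sheet name 'all_tables' is just a placeholder:

frames = [create_df_from_table(c, each_tab, excelWriter)  # excelWriter no longer used inside
          for c, each_tab in enumerate(file.tables)]

combined = pd.concat(frames, ignore_index=True)  # stack every table vertically
with pd.ExcelWriter('tablasFromDocx1.xlsx') as writer:
    combined.to_excel(writer, sheet_name='all_tables')  # one sheet for all tables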

Add every scraped item to csv row pandas

I have a Selenium project that scrapes a website and loops over elements to get their inner text.
I want to save every scraped text from this loop to a new row of a CSV file located next to the .py file, and accept new columns if they are added in the future.
How do I do that?
This is what I tried:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
for pTitle in prodTitle:
    itemName = pTitle
    pd = pd.dataframe(pTitle.text)
    pd.to_csv('data.csv', pd)
    print(pTitle.text)
but it only adds the last item
You can add the data in the same loop and then save the whole dataframe, like this:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
df = pd.DataFrame(columns=['Title'])
for idx, pTitle in enumerate(prodTitle):
    itemName = pTitle
    df.loc[idx, 'Title'] = pTitle.text
    print(pTitle.text)
df.to_csv('data.csv')
EDIT: to add more data it is convenient to set the columns before the loop, like this:
cols = ['Title', 'Col_0', 'Col_1', 'Col_N']
df = pd.DataFrame(columns=cols)
and then inside the loop:
...
    df.loc[idx, 'Title'] = title
    df.loc[idx, 'Col_0'] = data_0
    df.loc[idx, 'Col_1'] = data_1
    df.loc[idx, 'Col_N'] = data_N
...
EDIT (because I found another way):
You can create a list with all the data and then pass it to a DataFrame:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
data = []
for pTitle in prodTitle:
    itemName = pTitle
    data.append([pTitle.text, pTitle.data_0, pTitle.data_1, ...])
columns = ['Title', 'Col_0', 'Col_1', ...]
df = pd.DataFrame(data=data, columns=columns)
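Appending to a plain list and building the DataFrame once at the end is generally faster than growing the frame with df.loc inside the loop, so this second approach should also scale better as more columns are added.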

Hitting an error while iterating rows in a while loop followed by an if/else condition on a pandas DataFrame

With my current task I am trying to iterate over all of my Excel rows and then copy each row as a new record to the destination template file. The source file and the destination file are different, hence I am assigning the values to the destination cells.
# code run sample file
# import os
import pandas as pd
import xlwings as xw

newdir_path = " "
file1 = "list.xlsx"

# read the source file
data = pd.read_excel(file1, sheet_name=0, header=0,
                     index_col=False, keep_default_na=True)
# creating pandas dataframe from the source file
df = pd.DataFrame(data, columns=['Funktion', 'AD65', 'W70', 'B14', 'AC21'])

B14 = ['RR', 'BB', 'RA', 'MM']
booleans = []
# to iterate all the rows
for i in df.itertuples(index=True):
    # only read the rows where column AD65 (2nd column) does not have a blank value or None
    while i[AD65] != 'None':
        # to retrieve the row values
        # when B14 value is RR
        if B14 == 'RR':
            print(i)
            # retrieving the values
            Funktion = i.Funktion
            AD65 = i.AD65
            W70 = i.W70
            B14 = i.B14
            AC21 = i.AC21
            booleans.append(True)
        # when B14 is BB
        elif B14 == 'BB':
            print(i)
            Funktion = i.Funktion
            AD65 = i.AD65
            W70 = i.W70
            B14 = i.B14
            AC21 = i.AC21
            booleans.append(True)
        elif B14 == 'RA':
            # repeating the same as above
            booleans.append(True)
        elif B14 == 'MM':
            booleans.append(True)
        else:
            # I want to skip the rows when B14 value is blank
            pass
Note: I also tried my condition with if df.loc[['B14'] == 'RR']: and it raises KeyError: False.
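A minimal sketch of how the row selection could be expressed with boolean masks instead of the while loop, assuming the df and the column names ('AD65', 'B14', ...) from the snippet above:

wanted_b14 = ['RR', 'BB', 'RA', 'MM']

# keep rows where AD65 is not blank and B14 is one of the wanted codes
mask = df['AD65'].notna() & df['B14'].isin(wanted_b14)
selected = df[mask]

for row in selected.itertuples(index=True):  # iterate only over the matching rows
    print(row.Funktion, row.AD65, row.W70, row.B14, row.AC21)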

Change field name itertuple loop test

I need to change the field name ("newcolumn"):
def func(name_column):
    data = [1, 2, 3, 4, 5]
    df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
    df["name_column" + "_1"] = df.name_column.shift(1)
    for row in df.itertuples():
        if row.a < row.(name_column + "_1"):
            return print('ok')
I tried getattr(row, name_column + "_1") but without success.
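For what it's worth, getattr(row, name_column + "_1") does work on itertuples rows, provided the shifted column actually exists on the frame and its name is a valid Python identifier. A minimal sketch under those assumptions (the example frame and the column names 'a' and 'b' are made up):

import pandas as pd

def func(df, name_column):
    shifted = name_column + "_1"
    df[shifted] = df[name_column].shift(1)   # build the shifted helper column
    for row in df.itertuples():
        prev = getattr(row, shifted)         # dynamic field access on the namedtuple
        if pd.notna(prev) and row.a < prev:  # skip the first row's NaN
            print('ok')

# hypothetical example frame
df = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [5, 4, 3, 2, 1]})
func(df, 'b')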

Using pandas drop_duplicates but it doesn't correctly drop the duplicates

First of all, I'm not sure whether it is drop_duplicates()'s fault or not.
What I want to do:
Import the file from CSV and run a re.search on every row; if it matches, keep the row inside one dictionary, and if it doesn't match, keep the row inside another dictionary. Make a graph out of the length of the dictionary values.
The problem
I have 1000 rows inside the CSV, but the result returns 1200.
My code
import pandas as pd
import re

# import data
filename = 'sample.csv'
# save data as data
data = pd.read_csv(filename, encoding='utf-8')

# create new dictionaries for rows that are true and false
# but don't have the keyword in them
wordNT = {}
wordNF = {}
kaiT = {}
kaiF = {}

# if text is True
def word_in_text(word, text, label):
    match = re.search(word, text)
    if match and label == True:
        kaiT.setdefault('text', []).append(text)
    elif match and label == False:
        kaiF.setdefault('text', []).append(text)
    elif label == True and not match:
        wordNT.setdefault('text', []).append(text)
    elif label == False and not match:
        wordNF.setdefault('text', []).append(text)

# iterate every text in data
for index, row in data.iterrows():
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])

# make pandas data frames out of the dicts
wordTDf = pd.DataFrame.from_dict(wordNT)
wordFDf = pd.DataFrame.from_dict(wordNF)
kaiTDf = pd.DataFrame.from_dict(kaiT)
kaiFDf = pd.DataFrame.from_dict(kaiF)

# drop duplicates
wordTDf = wordTDf.drop_duplicates()
wordFDf = wordFDf.drop_duplicates()
kaiTDf = kaiTDf.drop_duplicates()
kaiFDf = kaiFDf.drop_duplicates()

# count how many
wordTrueCount = len(wordTDf.index)
wordFalseCount = len(wordFDf.index)
kaiTrueCount = len(kaiTDf.index)
kaiFalseCount = len(kaiFDf.index)
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
When I remove the line
word_in_text('bar', row['text'], row['label'])
and only keep
word_in_text('foo', row['text'], row['label'])
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount) correctly returns 1000, and vice versa.
But when I keep both calls, it returns 1200 when it should only be 1000.
CSV INPUT sample
text,label
"hey", TRUE
"halo", FALSE
"How are you?", TRUE
EXPECTED OUTPUT
1000
OUTPUT
1200
In the function word_in_text, you update the four dicts: wordNT, wordNF, kaiT and kaiF.
And you call word_in_text twice while iterating the dataframe:
# iterate every text in data
for index, row in data.iterrows():
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])
So the result is a mix of the matches for 'foo' and the matches for 'bar'.
Instead, you should clear the four dicts before starting a new search:
def search(text):
    # reset the module-level dicts that word_in_text appends to
    global wordNT, wordNF, kaiT, kaiF
    wordNT = {}
    wordNF = {}
    kaiT = {}
    kaiF = {}
    # iterate every text in data
    for index, row in data.iterrows():
        word_in_text(text, row['text'], row['label'])
    # make pandas data frames out of the dicts
    wordTDf = pd.DataFrame.from_dict(wordNT)
    wordFDf = pd.DataFrame.from_dict(wordNF)
    kaiTDf = pd.DataFrame.from_dict(kaiT)
    kaiFDf = pd.DataFrame.from_dict(kaiF)
    # drop duplicates
    wordTDf = wordTDf.drop_duplicates()
    wordFDf = wordFDf.drop_duplicates()
    kaiTDf = kaiTDf.drop_duplicates()
    kaiFDf = kaiFDf.drop_duplicates()
    # count how many
    wordTrueCount = len(wordTDf.index)
    wordFalseCount = len(wordFDf.index)
    kaiTrueCount = len(kaiTDf.index)
    kaiFalseCount = len(kaiFDf.index)
    print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
search('foo')
search('bar')
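For comparison, a hedged sketch of the same counting done without the dicts at all, using vectorized string matching; it assumes data has the 'text' and 'label' columns from the sample and that 'label' is read as a boolean:

def search_vectorized(word):
    match = data['text'].str.contains(word)   # regex search per row, like re.search above
    label = (data['label'] == True)           # mirrors the label == True checks
    buckets = [
        data.loc[match & label, 'text'],      # kaiT
        data.loc[match & ~label, 'text'],     # kaiF
        data.loc[~match & label, 'text'],     # wordNT
        data.loc[~match & ~label, 'text'],    # wordNF
    ]
    # drop duplicates inside each bucket, then sum the sizes
    print(sum(len(texts.drop_duplicates()) for texts in buckets))

search_vectorized('foo')
search_vectorized('bar')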
