Speed up pd.read_excel in Python

I'm writing Python code that reads Excel data and imports it into a database. For 10,000–30,000 records it is fine, but for 150,000+ records it takes over 13 seconds. How can I speed it up?
f = request.files['file']
all_data = {}  # insert group data
df = pd.read_excel(f)
df = df.dropna(how='all')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.index.name = excel_conf['config']['default_config']['identification_column']['name']  # column header identification
df.index += 1  # raise index to a valid one
df = df.fillna("")

########## this loop takes the time #########
for index, row in df.iterrows():
    row_dict = []
    for key in excel_conf['config']['column_config']:  # column header name list
        row_dict.append({
            key: row[key]
            # key (from excel config) : row[key] (row value from excel)
        })
    index_key = index_keygen(create_blake2s_signature(...))  # just creates the index key, shortened here
    # add child data to main dict
    all_data[index_key] = row_dict
    # "test_key" : [ { "key": "value" }, ... ]
#############################################
insert_db(all_data)  # this is fast
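One likely culprit is iterrows(), which constructs a Series object for every row. A minimal sketch of the same loop built on to_dict('records') instead, assuming the same excel_conf and the index_keygen / create_blake2s_signature helpers from the snippet above:

columns = list(excel_conf['config']['column_config'])  # the same column header names as above

all_data = {}
# to_dict('records') converts the frame to plain dicts in one pass,
# avoiding the per-row Series construction that makes iterrows() slow
for row in df[columns].to_dict('records'):
    row_dict = [{key: row[key]} for key in columns]
    index_key = index_keygen(create_blake2s_signature(...))  # unchanged, shortened as in the question
    all_data[index_key] = row_dict

insert_db(all_data)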

Related

Show excel data in only one Sheet

I'm having some doubts about the following function. I want it to write the result to a single Excel tab, but I can't get it to.
def create_df_from_table(c, tab, excelWriter):
    list_name = str(c) + "_result_list"
    list_name = []
    for i, each_row in enumerate(tab.rows):
        text = (each_cell.text for each_cell in each_row.cells)
        if i == 0:
            keys = tuple(text)
        else:
            each_dict_val = tuple(text)
            list_name.append(each_dict_val)
    list_name_copy = list_name.copy()
    result_df = pd.DataFrame(list_name)
    print(result_df)
    result_df.to_excel(excelWriter, sheet_name=str(c))
    return result_df

excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
for c, each_tab in enumerate(file.tables):
    globals()[f'result_df_{c}'] = create_df_from_table(c, each_tab, excelWriter)
excelWriter.save()
The result_df.to_excel() call inside the function above writes the dataframe to Excel, but it ends up in more than one tab, and I need all the data in a single one.
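A minimal sketch of one way to get everything onto a single tab, assuming the result_df.to_excel(...) call inside create_df_from_table is removed so the function only returns each table's DataFrame; the sheet name 'all_tables' is just a placeholder:

frames = [create_df_from_table(c, each_tab, excelWriter)  # excelWriter no longer used inside
          for c, each_tab in enumerate(file.tables)]

combined = pd.concat(frames, ignore_index=True)  # stack every table vertically
with pd.ExcelWriter('tablasFromDocx1.xlsx') as writer:
    combined.to_excel(writer, sheet_name='all_tables')  # one sheet for all tables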

Add every scraped item to csv row pandas

I have a Selenium project that scrapes a website and loops over elements to get their inner text.
I want to save every scraped text from this loop to a new row of a CSV file located next to the .py file, and accept new columns if they are added in the future.
How do I do that?
This is what I tried:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
for pTitle in prodTitle:
    itemName = pTitle
    pd = pd.dataframe(pTitle.text)
    pd.to_csv('data.csv', pd)
    print(pTitle.text)
but it only adds the last item
You can add the data in the same loop and then save the whole dataframe, like this:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
df = pd.DataFrame(columns=['Title'])
for idx, pTitle in enumerate(prodTitle):
    itemName = pTitle
    df.loc[idx, 'Title'] = pTitle.text
    print(pTitle.text)
df.to_csv('data.csv')
EDIT: to add more data it is convenient to set the columns before the loop, like this:
cols = ['Title', 'Col_0', 'Col_1', 'Col_N']
df = pd.DataFrame(columns=cols)
and then inside the loop:
...
    df.loc[idx, 'Title'] = title
    df.loc[idx, 'Col_0'] = data_0
    df.loc[idx, 'Col_1'] = data_1
    df.loc[idx, 'Col_N'] = data_N
...
EDIT (because I found another way):
You can create a list with all the data and then pass it to a DataFrame:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
data = []
for pTitle in prodTitle:
    itemName = pTitle
    data.append([pTitle.text, pTitle.data_0, pTitle.data_1, ...])
columns = ['Title', 'Col_0', 'Col_1', ...]
df = pd.DataFrame(data=data, columns=columns)
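Appending to a plain list and building the DataFrame once at the end is generally faster than growing the frame with df.loc inside the loop, so this second approach should also scale better as more columns are added.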

Hitting an error while iterating rows in a while loop followed by an if/else condition on a pandas DataFrame

With my current task I am trying to iterate over all of my Excel rows and then copy each row as a new record to the destination template file. The source file and the destination file are different, hence I am assigning the values to the destination cells.
# code run sample file
# import os
import pandas as pd
import xlwings as xw

newdir_path = " "
file1 = "list.xlsx"

# read the source file
data = pd.read_excel(file1, sheet_name=0, header=0,
                     index_col=False, keep_default_na=True)
# creating pandas dataframe from the source file
df = pd.DataFrame(data, columns=['Funktion', 'AD65', 'W70', 'B14', 'AC21'])

B14 = ['RR', 'BB', 'RA', 'MM']
booleans = []
# to iterate all the rows
for i in df.itertuples(index=True):
    # only read the rows where column AD65 (2nd column) does not have a blank value or None
    while i[AD65] != 'None':
        # to retrieve the row values
        # when B14 value is RR
        if B14 == 'RR':
            print(i)
            # retrieving the values
            Funktion = i.Funktion
            AD65 = i.AD65
            W70 = i.W70
            B14 = i.B14
            AC21 = i.AC21
            booleans.append(True)
        # when B14 is BB
        elif B14 == 'BB':
            print(i)
            Funktion = i.Funktion
            AD65 = i.AD65
            W70 = i.W70
            B14 = i.B14
            AC21 = i.AC21
            booleans.append(True)
        elif B14 == 'RA':
            # repeating the same as above
            booleans.append(True)
        elif B14 == 'MM':
            booleans.append(True)
        else:
            # I want to skip the rows when B14 value is blank
            pass
Note: I also tried my condition with if df.loc[['B14'] == 'RR']: and it raises KeyError: False.
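A minimal sketch of how the row selection could be expressed with boolean masks instead of the while loop, assuming the df and the column names ('AD65', 'B14', ...) from the snippet above:

wanted_b14 = ['RR', 'BB', 'RA', 'MM']

# keep rows where AD65 is not blank and B14 is one of the wanted codes
mask = df['AD65'].notna() & df['B14'].isin(wanted_b14)
selected = df[mask]

for row in selected.itertuples(index=True):  # iterate only over the matching rows
    print(row.Funktion, row.AD65, row.W70, row.B14, row.AC21)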

Change field name itertuple loop test

I need to change the field name ("newcolumn"):
def func(name_column):
    data = [1, 2, 3, 4, 5]
    df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
    df["name_column" + "_1"] = df.name_column.shift(1)
    for row in df.itertuples():
        if row.a < row.(name_column + "_1"):
            return print('ok')
I tried getattr(row, name_column + "_1") but without success.
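For what it's worth, getattr(row, name_column + "_1") does work on itertuples rows, provided the shifted column actually exists on the frame and its name is a valid Python identifier. A minimal sketch under those assumptions (the example frame and the column names 'a' and 'b' are made up):

import pandas as pd

def func(df, name_column):
    shifted = name_column + "_1"
    df[shifted] = df[name_column].shift(1)   # build the shifted helper column
    for row in df.itertuples():
        prev = getattr(row, shifted)         # dynamic field access on the namedtuple
        if pd.notna(prev) and row.a < prev:  # skip the first row's NaN
            print('ok')

# hypothetical example frame
df = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [5, 4, 3, 2, 1]})
func(df, 'b')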

Using pandas drop_duplicates but it doesn't correctly drop the duplicates

First of all, I'm not sure whether it is drop_duplicates()'s fault or not.
What I want to do:
Import the file from CSV and run a re.search on every row; if it matches, keep the row inside one dictionary, and if it doesn't match, keep the row inside another dictionary. Make a graph out of the length of the dictionary values.
The problem
I have 1000 rows inside the CSV, but the result returns 1200.
My code
import pandas as pd
import re

# import data
filename = 'sample.csv'
# save data as data
data = pd.read_csv(filename, encoding='utf-8')

# create new dictionaries for rows that are true and false
# but don't have the keyword in them
wordNT = {}
wordNF = {}
kaiT = {}
kaiF = {}

# if text is True
def word_in_text(word, text, label):
    match = re.search(word, text)
    if match and label == True:
        kaiT.setdefault('text', []).append(text)
    elif match and label == False:
        kaiF.setdefault('text', []).append(text)
    elif label == True and not match:
        wordNT.setdefault('text', []).append(text)
    elif label == False and not match:
        wordNF.setdefault('text', []).append(text)

# iterate every text in data
for index, row in data.iterrows():
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])

# make pandas data frames out of the dicts
wordTDf = pd.DataFrame.from_dict(wordNT)
wordFDf = pd.DataFrame.from_dict(wordNF)
kaiTDf = pd.DataFrame.from_dict(kaiT)
kaiFDf = pd.DataFrame.from_dict(kaiF)

# drop duplicates
wordTDf = wordTDf.drop_duplicates()
wordFDf = wordFDf.drop_duplicates()
kaiTDf = kaiTDf.drop_duplicates()
kaiFDf = kaiFDf.drop_duplicates()

# count how many
wordTrueCount = len(wordTDf.index)
wordFalseCount = len(wordFDf.index)
kaiTrueCount = len(kaiTDf.index)
kaiFalseCount = len(kaiFDf.index)
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
When I remove the line
word_in_text('bar', row['text'], row['label'])
and only keep
word_in_text('foo', row['text'], row['label'])
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount) correctly returns 1000, and vice versa.
But when I keep both calls, it returns 1200 when it should only be 1000.
CSV INPUT sample
text,label
"hey", TRUE
"halo", FALSE
"How are you?", TRUE
EXPECTED OUTPUT
1000
OUTPUT
1200
In the function word_in_text, you update the four dicts: wordNT, wordNF, kaiT and kaiF.
And you call word_in_text twice while iterating the dataframe:
# iterate every text in data
for index, row in data.iterrows():
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])
So the result is a mix of the matches for 'foo' and the matches for 'bar'.
Instead, you should clear the four dicts before starting a new search:
def search(text):
    # reset the module-level dicts that word_in_text appends to
    global wordNT, wordNF, kaiT, kaiF
    wordNT = {}
    wordNF = {}
    kaiT = {}
    kaiF = {}
    # iterate every text in data
    for index, row in data.iterrows():
        word_in_text(text, row['text'], row['label'])
    # make pandas data frames out of the dicts
    wordTDf = pd.DataFrame.from_dict(wordNT)
    wordFDf = pd.DataFrame.from_dict(wordNF)
    kaiTDf = pd.DataFrame.from_dict(kaiT)
    kaiFDf = pd.DataFrame.from_dict(kaiF)
    # drop duplicates
    wordTDf = wordTDf.drop_duplicates()
    wordFDf = wordFDf.drop_duplicates()
    kaiTDf = kaiTDf.drop_duplicates()
    kaiFDf = kaiFDf.drop_duplicates()
    # count how many
    wordTrueCount = len(wordTDf.index)
    wordFalseCount = len(wordFDf.index)
    kaiTrueCount = len(kaiTDf.index)
    kaiFalseCount = len(kaiFDf.index)
    print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
search('foo')
search('bar')
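For comparison, a hedged sketch of the same counting done without the dicts at all, using vectorized string matching; it assumes data has the 'text' and 'label' columns from the sample and that 'label' is read as a boolean:

def search_vectorized(word):
    match = data['text'].str.contains(word)   # regex search per row, like re.search above
    label = (data['label'] == True)           # mirrors the label == True checks
    buckets = [
        data.loc[match & label, 'text'],      # kaiT
        data.loc[match & ~label, 'text'],     # kaiF
        data.loc[~match & label, 'text'],     # wordNT
        data.loc[~match & ~label, 'text'],    # wordNF
    ]
    # drop duplicates inside each bucket, then sum the sizes
    print(sum(len(texts.drop_duplicates()) for texts in buckets))

search_vectorized('foo')
search_vectorized('bar')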
