I need to reference a field whose name is built dynamically ("newcolumn"):
def func(name_column):
    data = [[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]]
    df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
    df[name_column + "_1"] = df[name_column].shift(1)
    for row in df.itertuples():
        if row.a < row.(name_column + "_1"):  # invalid syntax: this is the lookup I cannot express
            return print('ok')
I tried getattr(row, name_column + "_1") but without success.
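For reference, a minimal sketch of the getattr approach on the namedtuples that itertuples() yields (the data here is illustrative, and the NaN check guards the first shifted row):

import pandas as pd

df = pd.DataFrame({'a': [3, 1, 2]})
name_column = 'a'
df[name_column + "_1"] = df[name_column].shift(1)

for row in df.itertuples():
    shifted = getattr(row, name_column + "_1")  # dynamic attribute lookup
    if pd.notna(shifted) and row.a < shifted:
        print('ok')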
I have some doubts about the following function. I want it to write all the results to a single Excel tab, but I can't get that to work.
def create_df_from_table(c, tab, excelWriter):
    list_name = str(c) + "_result_list"
    list_name = []  # note: this immediately overwrites the string above
    for i, each_row in enumerate(tab.rows):
        text = (each_cell.text for each_cell in each_row.cells)
        if i == 0:  # header row (enumerate never yields -1)
            keys = tuple(text)
        else:
            each_dict_val = tuple(text)
            list_name.append(each_dict_val)
    list_name_copy = list_name.copy()
    result_df = pd.DataFrame(list_name)
    print(result_df)
    result_df.to_excel(excelWriter, sheet_name=str(c))
    return result_df

excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
for c, each_tab in enumerate(file.tables):
    globals()[f'result_df_{c}'] = create_df_from_table(c, each_tab, excelWriter)
excelWriter.save()
The result_df.to_excel() call above does write the dataframes to Excel, but each one lands in its own tab, and I need all the data in a single tab.
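For what it's worth, a minimal sketch of one way to get everything into a single tab, assuming the per-table dataframes have compatible columns: remove the result_df.to_excel(...) call from the function, collect the frames, and write their concatenation once (the 'all_tables' sheet name is just illustrative):

excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
frames = [create_df_from_table(c, each_tab, excelWriter)
          for c, each_tab in enumerate(file.tables)]
pd.concat(frames, ignore_index=True).to_excel(excelWriter, sheet_name='all_tables')
excelWriter.save()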
I have a Selenium project that scrapes a website, looping over elements to get their inner text by class.
I want to save every scraped text from this loop as a new row in a CSV file located next to the .py file, and to accept new columns if any are added in the future.
How do I do that?
This is what I tried:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
for pTitle in prodTitle:
    itemName = pTitle
    df = pd.DataFrame([pTitle.text])
    df.to_csv('data.csv')  # overwrites the file on every iteration
    print(pTitle.text)
but it adds the last item only.
You can add the data in the same loop and then save the whole dataframe, like this:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
df = pd.DataFrame(columns=['Title'])
for idx, pTitle in enumerate(prodTitle):
    df.loc[idx, 'Title'] = pTitle.text  # append one row per element
    print(pTitle.text)
df.to_csv('data.csv')
EDIT: to add more data it is convenient to set the columns before the loop, like this:
cols = ['Title', 'Col_0', 'Col_1', 'Col_N']
df = pd.DataFrame(columns=cols)
and then inside the loop:
...
df.loc[idx, 'Title'] = title
df.loc[idx, 'Col_0'] = data_0
df.loc[idx, 'Col_1'] = data_1
df.loc[idx, 'Col_N'] = data_N
...
EDIT (because I found another way):
You can create a list with all the data and then pass it to a DataFrame:
prodTitle = driver.find_elements_by_xpath("//*[contains(@class,'itemTitle')]")
data = []
for pTitle in prodTitle:
    data.append([pTitle.text, pTitle.data_0, pTitle.data_1, ...])

columns = ['Title', 'Col_0', 'Col_1', ...]
df = pd.DataFrame(data=data, columns=columns)
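Whichever variant you use, building the full list first and creating the DataFrame once is also faster than growing it row by row with .loc; at the end the frame still gets written in one go:

df.to_csv('data.csv', index=False)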
I'm writing Python that reads Excel data and imports it into a database. For 10,000~30,000 records it is fine, but for 150,000+ records it takes over 13 seconds. How can I speed it up?
f = request.files['file']
all_data = {}  # grouped data to insert
df = pd.read_excel(f)
df = df.dropna(how='all')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.index.name = excel_conf['config']['default_config']['identification_column']['name']  # identification column header
df.index += 1  # shift the index so it starts at a valid value
df = df.fillna("")

########## this loop takes the time ##########
for index, row in df.iterrows():
    row_dict = []
    for key in excel_conf['config']['column_config']:  # column header names
        row_dict.append({
            key: row[key]  # key from the Excel config, row[key] the cell value
        })
    index_key = index_keygen(create_blake2s_signature(...))  # index-key creation, shortened here
    # add the child data to the main dict: index_key -> [{"key": "value"}, ...]
    all_data[index_key] = row_dict
##############################################

insert_db(all_data)  # this part is fast
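One common speed-up, sketched under the assumption that the keys in excel_conf['config']['column_config'] match the dataframe columns: replace iterrows() with to_dict('records'), which hands back plain dicts instead of building a pandas Series for every row:

keys = list(excel_conf['config']['column_config'])
for rec in df[keys].to_dict('records'):  # plain dicts, no per-row Series overhead
    row_dict = [{key: rec[key]} for key in keys]
    index_key = index_keygen(create_blake2s_signature(...))  # same elided key step as above
    all_data[index_key] = row_dict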
I'm trying to get some information from a website with Python, from a webshop.
I tried this one:
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except KeyError:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    # print(df)  ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index=False)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I see only 1 row in the CSV file.
I'm not entirely sure what you intend as the end result. Are you wanting to update an existing file, or get the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement in the function so each call provides a new dataframe.
import requests
from datetime import datetime
import pandas as pd

def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except KeyError:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    return df

headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
        df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
I have a Pandas dataframe, numeric_df, with a bunch of columns. I have this function:
def textstat_stats(text):
    difficulty = textstat.flesch_reading_ease(text)
    grade_difficulty = textstat.flesch_kincaid_grade(text)
    gfog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)
    lwf = textstat.linsear_write_formula(text)
    dcrs = textstat.dale_chall_readability_score(text)
    return pd.Series([difficulty, grade_difficulty, gfog, smog, ari, cli, lwf, dcrs])
which returns a Pandas Series. Now I'm trying this:
numeric_df[['difficulty', 'grade_difficulty','gfog','smog','ari','cli','lwf','dcrs']] = textstat_stats(text)
However, I get this Error:
KeyError: "['difficulty' 'grade_difficulty' 'gfog' 'smog' 'ari' 'cli' 'lwf' 'dcrs'] not in index"
What am I doing incorrectly?
Thanks!
It seems you need to add an index to the Series, which creates the column names:
def textstat_stats(text):
    difficulty = textstat.flesch_reading_ease(text)
    grade_difficulty = textstat.flesch_kincaid_grade(text)
    gfog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)
    lwf = textstat.linsear_write_formula(text)
    dcrs = textstat.dale_chall_readability_score(text)
    idx = ['difficulty', 'grade_difficulty', 'gfog', 'smog', 'ari', 'cli', 'lwf', 'dcrs']
    return pd.Series([difficulty, grade_difficulty, gfog, smog, ari, cli, lwf, dcrs],
                     index=idx)

df = textstat_stats(text)
print(df)
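If the goal is to fill those columns for every row of numeric_df, applying the function column-wise should also work; a sketch assuming the raw text lives in a column named 'text' (that column name is hypothetical):

idx = ['difficulty', 'grade_difficulty', 'gfog', 'smog', 'ari', 'cli', 'lwf', 'dcrs']
numeric_df[idx] = numeric_df['text'].apply(textstat_stats)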