Python nested for loop ordering - python

I am having issues getting a nested for loop to output individual CSV files for an API call. The API call is paginated, so we have to query the API multiple times and append the data. We also have to loop through every exchange.
The way the code is now, it only outputs the last page of data for a couple of exchanges, and the following exchanges just have 'name' in the CSV, no other data...
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
import pandas as pd
import time

## grab a list of all the exchanges listed on CG
ex_list = cg.get_exchanges_list()
# normalise the json
df = pd.json_normalize(ex_list)
# output to csv
#df.to_csv('exchange_list.csv', encoding='utf-8', index=False)
# make a list with just one column
id_list = df['id'].to_list()

def read_exchange_tickers():
    for x in id_list:
        for i in range(1, 10):
            appended_data = []
            data = cg.get_exchanges_tickers_by_id(x, page = str(i))
            appended_data.append(data)
            #time.sleep(10)
        # define path + filename
        path = 'ticker_lists/'
        filename = path + x + '_' + '.csv'
        appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        appended_data.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)

read_exchange_tickers()

You should collect all the data for each id and then save it to a file.
def read_exchange_tickers():
    for x in id_list:
        appended_data = []
        # collect all the data for the current id
        for i in range(1, 10):
            data = cg.get_exchanges_tickers_by_id(x, page = str(i))
            appended_data.append(data)
        # save the data to csv
        path = 'ticker_lists/'
        filename = path + x + '_' + '.csv'
        appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        appended_data.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)
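If you do not want to hard-code range(1, 10), a small variation (just a sketch, assuming the API returns an empty 'tickers' list once you request past the last page) is to keep fetching until a page comes back empty:

def read_exchange_tickers():
    for x in id_list:
        appended_data = []
        page = 1
        while True:
            data = cg.get_exchanges_tickers_by_id(x, page=str(page))
            # assumption: an empty 'tickers' list means there are no more pages
            if not data.get('tickers'):
                break
            appended_data.append(data)
            page += 1
        path = 'ticker_lists/'
        filename = path + x + '_' + '.csv'
        appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        appended_data.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)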

Related

Split large DataFrame into Dataframes containing records of unique values in a column

A CSV file has 90 million rows. One of the columns is named "State". It has 12 unique values at present. (The count of unique values in the "State" column is dynamic and can change with each CSV file.)
I want to split the DataFrame into smaller chunks and then save state-wise files.
The code below is not working.
source_path = "DataJune.txt"
for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=1000000)):
    dfs = dict(tuple(chunk.groupby('State')))
    for i, df in dfs.items():
        df = df.append(df)
        df.to_csv("tempcsv/" + i + ".csv", sep=",", index=False)
IIUC, try:
source_path = "DataJune.txt"
from collections import defaultdict

def def_value():
    return pd.DataFrame()

# Defining the dict
d = defaultdict(def_value)

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        d[state] = d[state].append(chunk[chunk['State'] == state])

for i, df in d.items():
    df.to_csv("tempcsv/" + str(i) + ".csv", sep=",", index=False)
Another version, based on the @Corralien comment:
source_path = "DataJune.txt"
for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        with open("tempcsv/" + str(state) + ".csv", mode='a+') as file:
            for i, row in chunk[chunk['State'] == state].iterrows():
                file.write(','.join([str(x) for x in row]))
                file.write('\n')
Another version:
source_path = "DataJune.txt"
from os.path import exists
import csv

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        path = "tempcsv/" + str(state) + ".csv"
        file_exists = exists(path)
        if not file_exists:
            with open(path, newline='', mode='a+') as file:
                writer = csv.writer(file)
                writer.writerow(chunk.columns)
                print(chunk.columns)
        with open(path, newline='', mode='a+') as file:
            writer = csv.writer(file)
            writer.writerows(chunk[chunk['State'] == state].values)
You can use:
import pandas as pd
import os

source_path = 'DataJune.txt'

fps = {}
for chunk in pd.read_csv(source_path, sep='|', chunksize=1000000, dtype=object):
    for state, df in chunk.groupby('State'):
        # New state, create a new file and write headers
        if state not in fps:
            fps[state] = open(f'tempcsv/{state}.csv', 'w')
            fps[state].write(f"{','.join(df.columns)}{os.linesep}")
        # Write data without headers
        df.to_csv(fps[state], index=False, header=False)

# Close files properly
for fp in fps.values():
    fp.close()
del fps
Update
Try to replace:
    # Write data without headers
    df.to_csv(fps[state], index=False, header=False)
by:
    # Write data without headers
    g = (row.strip() for row in df.to_csv(index=False, header=None, sep=',').split(os.linesep) if row)
    print(*g, sep=os.linesep, file=fps[state])

Code to read txt files and store in a python dataframe. Some files are not read and give NaN value

While reading txt files from folders, some files return NaN even though the actual files are not blank.
I am unable to figure out why those files are being left out.
import pandas as pd
import os

def get_transcripts(file_path):
    try:
        with open(file_path, 'r') as t:
            text = t.read()
        return text
    except:
        print(file_path)

# parent directory path
path = 'D:\\Urja\\Data_Analytics_project\\NPR_Podcasts\\NPR_Podcasts\\'
# dataframe to load the transcripts
df = pd.DataFrame()
for podcast in os.listdir(path)[0:20]:  # gets transcripts for the first 20 podcasts
    podcast_name = podcast
    id = 1
    for episode in os.listdir(path + podcast + '/'):
        episode_name = episode.replace('.txt', '')
        transcript = get_transcripts(path + podcast + '/' + episode)
        temp = pd.DataFrame.from_dict({'id': [id],
                                       'podcast_name': [podcast_name],
                                       'episode_name': [episode_name],
                                       'transcript': [transcript]},
                                      orient='columns')
        df = pd.concat([df, temp], axis=0).reset_index(drop=True)
        id += 1
df
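One thing worth checking in the code above: the bare except in get_transcripts swallows the real error (for example a UnicodeDecodeError) and implicitly returns None, which then shows up in the DataFrame as a missing value. A small diagnostic sketch (the encoding choices here are assumptions, not something stated in the question) that surfaces the actual failure instead of hiding it:

def get_transcripts(file_path):
    # try a strict UTF-8 read first, then fall back to a lenient read so the
    # reason for the failure is printed rather than silently dropped
    try:
        with open(file_path, 'r', encoding='utf-8') as t:
            return t.read()
    except UnicodeDecodeError as e:
        print(f'{file_path}: {e}')
        with open(file_path, 'r', encoding='utf-8', errors='replace') as t:
            return t.read()
    except OSError as e:
        print(f'{file_path}: {e}')
        return None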

API request loop through pagination in python

I am looking to loop through the CoinGecko API for all of the exchanges listed on there, pull the tickers that are listed for each exchange (this is paginated to 100 rows), and loop through all of the pages. There is no way of telling how many there are. Then store all rows out to a CSV.
Here is what I have come up with so far.
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
import pandas as pd

# grab a list of all the exchanges listed on CG
ex_list = cg.get_exchanges_list()
#df_ex_list = pd.read_json(exchanges_list)
df = pd.json_normalize(ex_list)
# output to csv
df.to_csv('exchange_list.csv', encoding='utf-8', index=False)

id_list = df['id'].tolist()

def get_ex_tickers():
    for x in id_list:
        # get tickers
        d = cg.get_exchanges_tickers_by_id(x, page_integer = 2)  ### the number of pages is not known
        # import into pandas df
        df = pd.json_normalize(d, record_path=['tickers'], meta=['name'])
        # define path + filename
        path = 'ticker_lists/'
        filename = path + x + '_ticker_list' + '.csv'
        # output to csv
        df.to_csv(filename, encoding='utf-8', index=False)

get_ex_tickers()
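Since the page count is unknown, one way to handle it (a sketch, assuming an exhausted page comes back with an empty 'tickers' list, which matches the record_path used above, and that the page number is passed as page rather than page_integer) is to request pages one at a time and concatenate the normalised frames:

def get_ex_tickers():
    for x in id_list:
        frames = []
        page = 1
        while True:
            d = cg.get_exchanges_tickers_by_id(x, page=str(page))
            if not d.get('tickers'):
                break  # assumed: an empty page means we are past the last one
            frames.append(pd.json_normalize(d, record_path=['tickers'], meta=['name']))
            page += 1
        if frames:
            out = pd.concat(frames, ignore_index=True)
            out.to_csv('ticker_lists/' + x + '_ticker_list.csv', encoding='utf-8', index=False)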

Taking Same Worksheet from a Folder of xlsm Files with Python

I'm new to pandas/Python and I've come up with the following code to extract data from a specific part of a worksheet.
import openpyxl as xl
import pandas as pd

rows_with_data = [34,37,38,39,44,45,46,47,48,49,50,54,55,57,58,59,60,62,63,64,65,66,70,71,72,76,77,78,79,80,81,82,83,84,88,89,90,91,92]

path = r'XXX'
xpath = input('XXX')
file = r'**.xlsm'
xfile = input('Change file name, current is ' + file + ' :')
sheetname = r'Summary'

wb = xl.load_workbook(filename = xpath + '\\' + file, data_only = True)
sheet = wb.get_sheet_by_name(sheetname)
rows = len(rows_with_data)

line_items = []
for i in range(rows):
    line_items.append(sheet.cell(row = rows_with_data[i], column = 13).value)

period = []
for col in range(17, 35):
    period.append(sheet.cell(row = 20, column = col).value)

print(line_items)

vals = []
x = []
for i in range(rows):
    if i != 0:
        vals.append(x)
        x = []
    for col in range(17, 35):
        x.append(sheet.cell(row = rows_with_data[i], column = col).value)
vals.append(x)

all_values = {}
all_values['Period'] = period
for i in range(rows):
    print(line_items[i])
    all_values[line_items[i]] = vals[i]

print(all_values)

period_review = input('Enter a period (i.e. 2002): ')
item = input('Enter a period (i.e. XXX): ')
time = period.index(period_review)
display_item = str(all_values[item][time])

print(item + ' for ' + period_review + " is " + display_item)

Summary_Dataframe = pd.DataFrame(all_values)

writer = pd.ExcelWriter(xpath + '\\' + 'values.xlsx')
Summary_Dataframe.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
I have the same worksheet (summary results) across a library of 60 xlsm files and I'm having a hard time figuring out how to iterate this across the entire folder of files. I also want to change this from extracting specific rows to taking the entire "Summary" worksheet, pasting it into a new file and naming the worksheet after its source file ("Experiment_A") when pasted into the new Excel file. Any advice?
I was having a hard time reading your code to understand what you want to do in the end, so this is just advice, not a solution. You can iterate through all the files in the folder using os, read them into one DataFrame, then save the single big DataFrame to CSV. I usually avoid Excel, but I guess you need the Excel conversion. In the example below I read all the txt files from a directory, put them into a list of DataFrames, then store the big DataFrame as JSON. You can also store it as Excel/CSV.
import os
import pandas as pd

def process_data():
    # input file path in 2 parts in case it is very long
    input_path_1 = r'\\path\to\the\folder'
    input_path_2 = r'\second\part\of\the\path'
    # building the full file path
    file_path = input_path_1 + input_path_2
    # listing all files in the folder
    file_list = os.listdir(os.path.join(file_path))
    # selecting only the .txt files into a list object
    file_list = [file_name for file_name in file_list if '.txt' in file_name]
    # selecting the fields we need
    field_names = ['country', 'ticket_id']
    # defining a list to collect all the dataframes
    pd_list = []
    inserted_files = []
    # looping over txt files and storing them in the list
    for file_name in file_list:
        # creating the file path to read the file
        file_path_ = file_path + '\\' + file_name
        df_ = pd.read_csv(os.path.join(file_path_), sep='\t', usecols=field_names)
        # converting the datetime to date
        # a few internal data transformations before writing
        df_['sent_date'] = pd.to_datetime(df_['sent_date'])
        df_['sent_date'] = df_['sent_date'].values.astype('datetime64[M]')
        # adding each dataframe to the list
        pd_list.append(df_)
        # adding the file name to the inserted list to print later
        inserted_files.append(file_name)
        print(inserted_files)
    # sql-like union of all dataframes to create a single data source
    df_ = pd.concat(pd_list)
    output_path_1 = r'\\path\to\output'
    output_path_2 = r'\path\to\output'
    output_path = output_path_1 + output_path_2
    # put the file name
    file_name = 'xyz.json'
    # adding the day the file was processed
    df_['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    # write file to json
    df_.to_json(os.path.join(output_path, file_name), orient='records')
    return print('Data Stored as json successfully')

process_data()
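Since the original question is specifically about copying the same "Summary" worksheet out of a folder of .xlsm files into one new workbook with one sheet per source file, here is a minimal sketch of that variant. The folder path and output file name are placeholders, and it assumes each file really contains a sheet called 'Summary':

import os
import pandas as pd

folder = r'\\path\to\xlsm\folder'   # placeholder path
out_file = os.path.join(folder, 'combined_summaries.xlsx')

with pd.ExcelWriter(out_file, engine='openpyxl') as writer:
    for file_name in os.listdir(folder):
        if not file_name.endswith('.xlsm'):
            continue
        # read the whole Summary sheet from this workbook as raw cells
        summary = pd.read_excel(os.path.join(folder, file_name),
                                sheet_name='Summary', header=None)
        # name the new sheet after the source file, e.g. 'Experiment_A'
        sheet_name = os.path.splitext(file_name)[0][:31]  # Excel limits sheet names to 31 chars
        summary.to_excel(writer, sheet_name=sheet_name, index=False, header=False)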

I want to write a looping DataFrame to Excel

I am new to Python. This task is mainly to read the Excel files in a directory and filter the data in each one. After filtering, I write back to Excel. When I try to write to Excel it stores only the last iteration's values. Please advise on how to write all the data to Excel. I want to write df_filter and df_filter1, which are built in a for loop, to Excel.
import os
import xlrd
import pandas as pd
import xlwt
from openpyxl import load_workbook
import xlsxwriter
from pyexcelerate import Workbook
import numpy as np
from pandas import ExcelWriter
from tempfile import TemporaryFile

ALL_SHEETS = []
sheet_list = ""
file_path = os.path.join(input("enter Dir path"))
config_path = os.path.join(input("enter your config file path here"))
output_path = os.path.join(input("Dude where you want store outputfile"))
output1 = pd.ExcelWriter(output_path, engine='xlsxwriter')
ALL_SHEETS = [os.path.join(file_path, f) for f in os.listdir(file_path)
              if os.path.isfile(os.path.join(file_path, f))
              and f.endswith('.xlsx')]
i = 0
data1 = []
data = []
Packet_size = []
Trail_numbers = []
Though_put = []
Latency = []
Jitter = []
df_filter = pd.DataFrame(columns=['packetsize', 'throughput', 'latency (us)', 'jitter (us)'])
df_filter1 = pd.DataFrame(columns=['packetsize', 'throughput', 'latency (us)', 'jitter (us)'])
#df_sheet = pd.DataFrame(columns=['zsheet'])
merged_inner = pd.DataFrame([])

def sheets(val):
    s = wb.worksheets[val]
    df_sheet = pd.DataFrame(data=['%s' % str(s) + '\n'])
    #Name_sheet(s)
    HeaderList = pd.read_csv(config_path)
    column_list = []
    for col in HeaderList:
        col = col.lstrip("'")
        col = col.rstrip("'")
        column_list.append(col)
    df1 = xl.parse(sheet_list[val], skiprows=i)
    df1 = df1.filter(column_list)
    df2 = df1[(df1['Result'] != 'Failed') & (df1['Frame Size Type'] == 'iMIX')]
    if df2.empty:
        pass
    else:
        final3 = df2.groupby(['Trial Number', 'iMIX Distribution'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        #df_filter['sheetaname'] = df_sheet(lambda a: '%s' % a['sheetvise'], axis=1)
        final = final3.groupby(['iMIX Distribution'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        df_filter['packetsize'] = final.apply(lambda z: '%s' % (z['iMIX Distribution']), axis=1)
        df_filter['throughput'] = final.apply(lambda z: '%s' % (z['Throughput (%)']), axis=1)
        df_filter['latency (us)'] = final.apply(lambda x: '%s/%s/%s' % (x['Minimum Latency (us)'], x['Maximum Latency (us)'], x['Average Latency (us)']), axis=1)
        df_filter['jitter (us)'] = final.apply(lambda y: '%s/%s/%s' % (y['Minimum Jitter (us)'], y['Maximum Jitter (us)'], y['Average Jitter (us)']), axis=1)
        df_filter.to_excel(output1, sheet_name='mani')
        output1.save()
        df_filter.to_excel(output1, startrow=len(df_filter1) + len(df_filter) + 2, sheet_name='mani')
        output1.save()
    df3 = df1[(df1['Result'] != 'Failed') & (df1['Frame Size Type'] == 'Fixed')]
    if df3.empty:
        pass
    else:
        final2 = df3.groupby(['Trial Number', 'Configured Frame Size'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        final1 = final2.groupby(['Configured Frame Size'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        df_filter1['packetsize'] = final1.apply(lambda z: '%s' % (z['Configured Frame Size']), axis=1)
        df_filter1['throughput'] = final1.apply(lambda z: '%s' % (z['Throughput (%)']), axis=1)
        df_filter1['latency (us)'] = final1.apply(lambda x: '%s/%s/%s' % (x['Minimum Latency (us)'], x['Maximum Latency (us)'], x['Average Latency (us)']), axis=1)
        df_filter1['jitter (us)'] = final1.apply(lambda y: '%s/%s/%s' % (y['Minimum Jitter (us)'], y['Maximum Jitter (us)'], y['Average Jitter (us)']), axis=1)
        df_filter1.to_excel(output1, sheet_name='mani')
        df_filter1.to_excel(output1, startrow=len(df_filter1) + len(df_filter) + 2, sheet_name='mani')
        output1.save()

def sheet_every():
    for sheet in range(0, sheet_list_lenght):
        sheets(sheet)

for file in (ALL_SHEETS):
    df_file = pd.DataFrame(data=[file])
    workbook = xlrd.open_workbook(file)
    wb = load_workbook(file)
    xl = pd.ExcelFile(file)
    i = 0
    sheet_list = workbook.sheet_names()
    sheet_list_lenght = (len(sheet_list))
    for sheet in sheet_list:
        worksheet = workbook.sheet_by_name(sheet)
        for i in range(0, worksheet.nrows):
            row = worksheet.row_values(i)
            if 'Trial Number' in row:
                break
    sheet_every()
Not sure if this answers your question or not, but if you want to read from a dataframe and add rows to a new dataframe through a loop, you can refer to the code below:
dummyData = pd.read_csv("someexcelfile.csv")
# You can merge multiple dataframes into dummyData and make it a big dataframe
dummyInsertTable = pd.DataFrame(columns=["Col1", "Col2", "Col3"])

for i in range(len(dummyData)):
    dummyInsertTable.loc[i, "Col1"] = dummyData["Col1"][i]
    dummyInsertTable.loc[i, "Col2"] = dummyData["Col2"][i]
    dummyInsertTable.loc[i, "Col3"] = dummyData["Col3"][i]
dummyInsertTable.to_csv("writeCSVFile.csv")
And next time, be precise about where you are facing the problem.
EDIT
Try loading the first DataFrame, then loop through the other files and append them to the first DataFrame. Refer to the code:
import pandas as pd

# Make a list of all the files you have
filesList = ["/home/bhushan/firstFile.csv", "/home/bhushan/secondFile.csv", "/home/bhushan/thirdFile.csv", "/home/bhushan/fourthFile.csv"]

# Read the first csv file using pandas.read_csv
firstFile = pd.read_csv(filesList[0])

# Loop through the rest of the files and append them to the first DataFrame
for i in range(1, len(filesList)):
    fileToBeAdded = pd.read_csv(filesList[i])
    firstFile = firstFile.append(fileToBeAdded)

# Write the final file
finalFile = firstFile
finalFile.to_csv("finalFile.csv")
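A caveat on the snippet above: DataFrame.append was deprecated and removed in pandas 2.0, so on current pandas the same idea is normally written with pd.concat. A sketch, using the same hypothetical file list:

import pandas as pd

filesList = ["/home/bhushan/firstFile.csv",
             "/home/bhushan/secondFile.csv",
             "/home/bhushan/thirdFile.csv",
             "/home/bhushan/fourthFile.csv"]

# read every file and concatenate the resulting DataFrames in one go
finalFile = pd.concat((pd.read_csv(f) for f in filesList), ignore_index=True)
finalFile.to_csv("finalFile.csv", index=False)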
If I understand your question correctly, you have two DataFrames which you want to write to one Excel file, but you are only getting the last one.
You should write them to two different sheets instead; then you can retrieve them as required, either individually or combined.
Follow the links below for more details and implementation:
https://xlsxwriter.readthedocs.io/example_pandas_multiple.html
https://campus.datacamp.com/courses/importing-managing-financial-data-in-python/importing-stock-listing-data-from-excel?ex=11
Also, you can instead write to a CSV file, which is also Excel compatible and easier to handle. I have observed that it is faster and more space efficient than writing to an .xlsx file.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
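For reference, a minimal sketch of the two-sheet approach from the xlsxwriter link above (the output file name, sheet names, and stand-in DataFrames are placeholders):

import pandas as pd

# hypothetical stand-ins for the df_filter and df_filter1 built in the question's loop
df_filter = pd.DataFrame({'packetsize': ['iMIX'], 'throughput': ['99.5']})
df_filter1 = pd.DataFrame({'packetsize': ['64'], 'throughput': ['98.7']})

# one writer, two sheets: each DataFrame keeps its own sheet instead of overwriting the other
with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    df_filter.to_excel(writer, sheet_name='imix_results', index=False)
    df_filter1.to_excel(writer, sheet_name='fixed_results', index=False)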
