So, toward the end of my first file (we'll call it /file.py):
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    # excel_data = pandas.read_excel(self.find_file())
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # print(excel_data)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")
        total_records = 0
        num_valid_records = 0
        num_invalid_records = 0
        for row in excel_data.itertuples():
            mrn = row.MRN
            total_records += 1
            if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
                # print(f"Invalid record: {row}")
                num_invalid_records += 1
                # total_invalid = num_invalid_records + dup_count
                excel_data = excel_data.drop(excel_data.index[row.Index])
                # continue
            else:
                # print(mrn) # outputs all MRN ids
                for row in excel_data.itertuples():
                    num_valid_records += 1
                    continue
        with open("./logs/metrics.csv", "a", newline="\n") as f:
            csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
            # csv_writer.writeheader()
            currentDT = datetime.datetime.now()
            success_rate = num_valid_records / total_records * 100
            csv_writer.writerow(dict(date=currentDT,
                                     total_records=total_records,
                                     processed=num_valid_records,
                                     skipped=num_invalid_records,
                                     success_rate=num_valid_records / total_records * 100))
    return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file, second_file.py, I would like to pick up that end state and do another iteration, for instance:
def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, error
Both files have import pandas.
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file. Here is a link to some more info: pickle info
import pandas as pd
import pickle

x = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

# this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need 'wb' for the dump
    pickle.dump(x, fh)

# to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)

# you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it back in at the start of script two.
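Applied to the two files above, that could look something like the sketch below. save_frame, load_frame, and the cleaned_data.p path are illustrative names, not from the original code: file.py would call save_frame with the frame returned by clean_data_frame, and second_file.py would call load_frame before process_records runs.

import pickle

# end of file.py: dump whatever clean_data_frame() returned
def save_frame(data_frame, path='cleaned_data.p'):  # path is just an example
    with open(path, 'wb') as fh:
        pickle.dump(data_frame, fh)

# start of second_file.py: load the same frame back in a new process
def load_frame(path='cleaned_data.p'):
    with open(path, 'rb') as fh:
        return pickle.load(fh)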
Use DataFrame.to_pickle and pandas.read_pickle:
To persist:
df.to_pickle('./dataframe.pkl')
To load:
df = pd.read_pickle('./dataframe.pkl')
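Both approaches write the same pickle format under the hood; to_pickle and read_pickle just save you the explicit open() calls, and on reasonably recent pandas versions they infer compression from the file extension, so a path like './dataframe.pkl.gz' gets you a compressed file with no extra code.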
I have many text files on GCS and I want to load them into BigQuery. I already loaded them, but the fields end up in the wrong columns.
Here is my code.
def string_2_dataframe(string, names, widths, audit_load_key):
    io_string = StringIO(string)
    dataframe = pandas.read_csv(
        io_string, names=names, sep="\t", dtype=str)
    dataframe = dataframe.fillna("")
    dataframe = dataframe.replace("nan", "")
    dataframe['audit_load_key'] = audit_load_key
    print(dataframe)
    return dataframe

def dataframe_2_bigquery(dataframe, table_id, job_config):
    bigquery_client = bigquery.Client()
    # Make an API request.
    job = bigquery_client.load_table_from_dataframe(
        dataframe, config[table_id])
    job.result()  # Wait for the job to complete.
    table = bigquery_client.get_table(config[table_id])
    return "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), config[table_id])

def blob_2_bigquery(blob_name, fnames, fwidths, table_id, job_config, audit_load_key):
    fs = GCSFileSystem()
    source = f"{config['bucket']}/{blob_name}"
    with fs.open(source, "r", encoding="utf-8", errors="ignore") as source:
        count = 0
        block = ""
        while True:
            line = source.readline()
            block += line
            count += 1
            if count % 10000 == 0:
                dataframe = string_2_dataframe(block, fnames, fwidths, audit_load_key)
                result = dataframe_2_bigquery(dataframe, table_id, job_config)
                block = ""
            if not line:
                dataframe = string_2_dataframe(block, fnames, fwidths, audit_load_key)
                result = dataframe_2_bigquery(dataframe, table_id, job_config)
                break
And here is my sample text file.
Please help me to solve this issue.
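One thing that stands out: string_2_dataframe receives widths but never uses it, and reads with sep="\t". If the source files are actually fixed-width rather than tab-separated, pandas.read_fwf is probably the better fit. This is only a guess at the file layout; a minimal sketch, assuming names and widths describe the real columns:

from io import StringIO
import pandas

def string_2_dataframe(string, names, widths, audit_load_key):
    io_string = StringIO(string)
    # slice each line by column widths instead of splitting on a delimiter
    dataframe = pandas.read_fwf(io_string, names=names, widths=widths, dtype=str)
    dataframe = dataframe.fillna("")
    dataframe['audit_load_key'] = audit_load_key
    return dataframe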
I am trying to convert nested JSON to CSV using Python. There can be multiple values for some attributes, like phone1, phone2, phone3 for a single individual. I wrote Python code for this and it does the job, but I am getting the values of the repeated attributes in multiple columns, and I want them in multiple rows. For example:
my output:
name  phone1  phone2  phone3
xyx   98      34      56
required output:
name  phone
xyx   98
xyx   34
xyx   56
The code for this is:
import sys
import json
import csv
import io
import pandas as pd

##
# Convert to string keeping encoding in mind...
##

processed_data = []

def to_string(s):
    try:
        return str(s)
    except:
        # Change the encoding type if needed
        return s.encode('utf-8')

def reduce_item(key, value):
    global reduced_item
    global res
    # Reduction Condition 1
    if type(value) is list:
        i = 0
        for sub_item in value:
            reduce_item(key + '_' + to_string(i), sub_item)
            i = i + 1
    # Reduction Condition 2
    elif type(value) is dict:
        sub_keys = value.keys()
        for sub_key in sub_keys:
            reduce_item(key + '_' + to_string(sub_key), value[sub_key])
    # Base Condition
    else:
        # if reduced_item.get(to_string(key)):
        #     reduced_item[to_string(key)] = to_string(value)
        #     processed_data.append(reduced_item)
        # else:
        reduced_item[to_string(key)] = to_string(value)

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("\nUsage: python json_to_csv.py <node> <json_in_file_path> <csv_out_file_path>\n")
    else:
        # Reading arguments
        node = sys.argv[1]
        json_file_path = sys.argv[2]
        csv_file_path = sys.argv[3]
        with io.open(json_file_path, 'r', encoding='utf-8-sig') as fp:
            json_value = fp.read()
            raw_data = json.loads(json_value)
        try:
            data_to_be_processed = raw_data[node]
        except:
            data_to_be_processed = raw_data
        header = []
        for item in data_to_be_processed:
            reduced_item = {}
            reduce_item(node, item)
            header += reduced_item.keys()
            processed_data.append(reduced_item)
        header = list(set(header))
        header.sort()
        with open(csv_file_path, 'w+') as f:
            writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL)
            writer.writeheader()
            for row in processed_data:
                writer.writerow(row)
        print("Just completed writing csv file with %d columns" % len(header))
I tried changing the code but was not able to achieve the desired result. It would be of great help if anyone could suggest changes to this code.
Thanks in advance.
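For the wide-to-long reshape described above (phone1/phone2/phone3 into a single phone column), pandas.melt does the job once a flat dataframe exists. A small sketch on the example data only, not on the full JSON pipeline:

import pandas as pd

# the wide frame from the example output above
wide = pd.DataFrame({'name': ['xyx'], 'phone1': [98], 'phone2': [34], 'phone3': [56]})

# melt stacks the phone columns into one row per (name, phone) pair
long = wide.melt(id_vars='name', value_vars=['phone1', 'phone2', 'phone3'],
                 value_name='phone').drop(columns='variable')
print(long)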
I'm attempting to create a program that can read a CSV, determine if a substring is included in one of the columns of each row, and, if it isn't present, write certain columns out to a new CSV. I have the code down for this much, but the CSV I need to run the program on has well over 3 million rows. I use PyCharm, and currently I'm not able to process this much data; it can only view the CSV in a read-only format, which doesn't let me work with it. I know pandas has a chunksize feature, but I don't know how to implement it with the rest of my code.
def reading(csv_input):
    originalLength = 0
    rowCount = 0
    with open(f'Web Report {csv_input}', 'w') as file:
        writer = csv.writer(file)
        writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
        dropCount = 0
        data = pd.read_csv(csv_input, chunksize=100000)
        df = pd.DataFrame(data,
                          columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
                                   'URL Category', 'Action', 'Action Description'])
        originalLength = len(df.index)
        for line in range(originalLength):
            dataLine = df.loc[line]
            x = dataLine.get(key='Action')
            if x == 0:
                siteName = dataLine.get(key='Site Name')
                if 'dbk' in siteName:
                    dropCount = dropCount + 1
                elif 'ptc' in siteName:
                    dropCount = dropCount + 1
                elif 'wcf' in siteName:
                    dropCount = dropCount + 1
                elif 'google' in siteName:
                    dropCount = dropCount + 1
                else:
                    writer.writerow([line,                                  # Original Index
                                     df.loc[line].get(key='URL Category'),  # Original URL Category
                                     df.loc[line].get(key='User IP'),       # Original User IP
                                     df.loc[line].get(key='Site Name')])    # Original Site Name
                    rowCount = rowCount + 1
            else:
                dropCount = dropCount + 1
    file.close()
    print("Input: " + str(csv_input))
    print("Output: " + str(file.name))
    print("Original Length: " + str(originalLength))
    print("Current Length: " + str(rowCount))
    print("Drop Count: " + str(dropCount) + "\n")
    return df
If you use csv to write the file, then you could also use it to read row by row.
import csv

with open('input.csv') as infile, open('output.csv', 'w') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)
    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)
    # process rows
    for row in csv_reader:  # read row by row
        # keep only rows with even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
If you want to use pandas with chunks, then you should use a for-loop for this.
And when you write with pandas, you need append mode without headers.
import pandas as pd

first = True

for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with even index
    if df.index % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create new file with headers
            df.to_csv('output.csv', mode='w')
            first = False
        else:
            # append to existing file without headers
            df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv

# --- create some data ---

data = {
    'A': range(0, 10),
    'B': range(10, 20),
    'C': range(20, 30),
}  # columns

df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)

# --- read and write with `pandas` ---

first = True

for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with even index
    if df.index % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create empty with headers
            df.to_csv('output_pandas.csv', mode='w')
            first = False
        else:
            # append to existing file without headers
            df.to_csv('output_pandas.csv', mode='a', header=False)

# --- read and write with `csv` ---

with open('input.csv') as infile, open('output.csv', 'w') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)
    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)
    # process rows
    for row in csv_reader:
        # keep only rows with even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
Doc: read_csv(), to_csv()
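chunksize=1 above is only for demonstration; with millions of rows, a much larger chunk and a vectorized filter per chunk will be far faster. A sketch of the same even-index filter with chunksize=100000 (not from the answer above):

import pandas as pd

first = True
for chunk in pd.read_csv('input.csv', chunksize=100000):
    # the default RangeIndex keeps counting across chunks, so this mask still works
    keep = chunk[chunk.index % 2 == 0]
    keep.to_csv('output_pandas.csv', mode='w' if first else 'a', header=first)
    first = False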
import pandas as pd
import requests
import json
import datetime
import csv

def get_pushshift_data(after, before, sub):
    url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before=' + str(before) + '&subreddit=' + str(sub) + '&sort=asc&sort_type=created_utc&size=400'
    print(url)
    r = requests.get(url).json()
    # data = json.loads(r.text, strict=False)
    return r['data']

def collect_subData(subm):
    subData = list()  # list to store data points
    title = subm['title']
    url = subm['url']
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"
    try:
        # returns the body of the posts
        body = subm['selftext']
    except KeyError:
        body = ''
    author = subm['author']
    subId = subm['id']
    score = subm['score']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # 1520561700.0
    numComms = subm['num_comments']
    permalink = subm['permalink']
    subData.append((subId, title, body, url, author, score, created, numComms, permalink, flair))
    subStats[subId] = subData

def update_subFile():
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    file = location + filename
    with open(file, 'w', newline='', encoding='utf-8') as file:
        a = csv.writer(file, delimiter=',')
        headers = ["Post ID", "Title", "Body", "Url", "Author", "Score", "Publish Date", "Total No. of Comments", "Permalink", "Flair"]
        a.writerow(headers)
        for sub in subStats:
            a.writerow(subStats[sub][0])
            upload_count += 1
        print(str(upload_count) + " submissions have been uploaded into a csv file")

# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
# Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021, 5, 17, 0, 0).timestamp())
after = int(datetime.datetime(2014, 1, 1, 0, 0).timestamp())

data = get_pushshift_data(after, before, sub)

while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount += 1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)

print(len(data))
update_subFile()
The first data = get_pushshift_data(after, before, sub) call scrapes the data with no error. But when the same call runs again at the bottom of the while loop, with a different value for the after variable (type: int), the program raises JSONDecodeError: Expecting value: line 1 column 1 (char 0).
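That error means r.json() was handed an empty or non-JSON response body (Pushshift is known to rate-limit, so this is common when calling it in a tight loop). A sketch of a more defensive get_pushshift_data with a status check and a simple retry; the retry count and back-off values are arbitrary:

import time
import requests

def get_pushshift_data(after, before, sub, retries=5):
    url = ('https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) +
           '&before=' + str(before) + '&subreddit=' + str(sub) +
           '&sort=asc&sort_type=created_utc&size=400')
    for attempt in range(retries):
        r = requests.get(url)
        # only try to decode when the server actually returned a JSON body
        if r.status_code == 200 and r.text.strip():
            return r.json()['data']
        time.sleep(2 * (attempt + 1))  # back off and try again
    return []  # give up; an empty list also stops the caller's while-loop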
This function reads from a text file, re-formats the contents, and then writes them to a CSV. I'm trying to use threading to parallelize the for i in lines loop; it is the longest part of a larger script and takes up most of the run time, because the list lines contains thousands of elements. Can someone help me straighten this out? Doing this synchronously instead of in parallel takes tons of time. I have seen many other answers to similar questions, but I haven't yet understood them well enough to implement them correctly.
def sheets(i):
    # time format for spreadsheet
    dt_time = datetime.now().strftime('%m/%d|%H:%M')
    # for league name (NFL,NBA,NHL ETC.) in list containing league names
    for league_name in leagues2:
        league_name = league_name.split('|')[0]
        with open(final_stats_path, 'r+') as lines:
            lines = lines.readlines()
        # i = one long string containing details about the event in the loop, eg. sport, game day, game id, home team name
        for i in lines:
            i = i.split(',')
            minprice = i[6]
            totaltix = i[5]
            event_date = i[2]
            try:
                dayofweek = datetime.strptime(event_date, '%Y-%m-%d').strftime('%A')
            except:
                continue
            event_date = i[2][2:]
            event_date = str(event_date).split('-')
            event_date = event_date[1]+'/'+event_date[2]
            sport = i[4]
            event = i[1].replace('Basketball','').replace('\n','')
            away = i[8].replace('Basketball', '').replace('\n','')
            eventid = i[0]
            event_home = i[9].replace('Basketball', '').replace('\n','')
            event = event.split(' at ')[0]
            tixdata = str(totaltix)
            eventid = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId='+str(eventid)+'&sectionId=0'
            directory = root+'\data'+'\\'+sport+'\\'
            report = directory+'report.xlsx'
            fname = directory+'teams.txt'
            eventleague = sport
            f = open(directory+'acronym.txt', 'r+')
            lines_2 = f.readlines()
            for qt in lines_2:
                qt = qt.split('-')
                compare = qt[1]
                if event_home in compare:
                    event_home = qt[0]
                else:
                    pass
            troop = []
            d = {
                'ID' : eventid,
                'Date' : event_date,
                'Day' : dayofweek,
                'Away' : away,
            }
            s = {
                'time' : tixdata
            }
            numbers = event_home+'.txt'
            numbers_new = 'bk\\bk_'+numbers
            with open(directory+numbers_new, 'a+') as y:
                pass
            with open(directory+numbers, 'a+') as o:
                pass
            with open(directory+numbers, 'r+') as g:
                for row in g:
                    if str(eventid) in row:
                        #print('the event is in the list')
                        row_update = row.replace('}', ", '"+dt_time+"': '"+tixdata+"'}")
                        with open(directory+numbers_new, 'a+') as y:
                            y.write(row_update)
                        break
                    else:
                        with open(directory+numbers, 'a+') as p:
                            #print('the event is not in the list')
                            p.write(str(d)+'\n')
                        with open(directory+numbers_new, 'a+') as n:
                            n.write(str(d)+'\n')
            sizefile = os.path.getsize(directory+numbers_new)
            if sizefile > 0:
                shutil.copy(directory+numbers_new, directory+numbers)
                open(directory+numbers_new, 'w').close()
            else:
                pass
            df = []
            with open(directory+numbers, 'r+') as t:
                for row in t:
                    b = eval(row)
                    dfs = df.append(b)
            df = pd.DataFrame(df)
            yark = list(df.columns)[:-5]
            zed = ['ID', 'Date', 'Day', 'Away']
            columns = zed+yark
            try:
                df = df[columns]
            except:
                pass
            df.index = range(1, 2*len(df)+1, 2)
            df = df.reindex(index=range(2*len(df)))
            writer = pd.ExcelWriter(directory+event_home+'.xlsx', engine='xlsxwriter')
            try:
                df.to_excel(writer, sheet_name=event_home)
            except:
                continue
            workbook = writer.book
            worksheet = writer.sheets[event_home]
            format1 = workbook.add_format({'num_format': '#,##0.00'})
            worksheet.set_column('A:ZZ', 18, format1)
            writer.save()
if __name__ == "__main__":
    pool = ThreadPool(8)        # Make the Pool of workers
    results = pool.map(sheets)  # Open the urls in their own threads
    pool.close()                # close the pool and wait for the work to finish
    pool.join()
    ## get_numbers()
    ## stats_to_csv()
    ## stats_to_html()
    # sheets()
Try changing the following line:
results = pool.map(sheets)
to:
results = pool.map(sheets,range(8))
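pool.map() calls the worker once per item of the iterable you pass as its second argument, so with no iterable there is nothing to map over, which is why the original call fails. range(8) satisfies that, but note that sheets() as written ignores its argument and re-reads final_stats_path itself, so eight workers would each redo the whole job. A sketch of the more natural split, where each call handles one line; sheets() would then need to drop its own for i in lines loop and just process the single i it receives:

from multiprocessing.pool import ThreadPool  # assuming this is the ThreadPool the script imports

if __name__ == "__main__":
    # read the work items once in the main thread
    with open(final_stats_path) as f:   # final_stats_path as defined in the script above
        lines = f.readlines()
    pool = ThreadPool(8)                # pool of 8 worker threads
    results = pool.map(sheets, lines)   # one call to sheets() per line
    pool.close()                        # no more work will be submitted
    pool.join()                         # wait for all threads to finish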