I have many text files on the GCS and I want to load them into the BigQuery. I already loaded them, but it inserts fields in different columns.
Here are my code.
def string_2_dataframe(string, names, widths,audit_load_key):
io_string = StringIO(string)
dataframe = pandas.read_csv(
io_string, names=names, sep="\t", dtype=str)
dataframe = dataframe.fillna("")
dataframe = dataframe.replace("nan", "")
dataframe['audit_load_key'] = audit_load_key
print(dataframe)
return dataframe
def dataframe_2_bigquery(dataframe, table_id, job_config):
bigquery_client = bigquery.Client()
# Make an API request.
job = bigquery_client.load_table_from_dataframe(
dataframe, config[table_id])
job.result() # Wait for the job to complete.
table = bigquery_client.get_table(config[table_id])
return "Loaded {} rows and {} columns to {}".format(
table.num_rows, len(table.schema), config[table_id])
def blob_2_bigquery(blob_name, fnames, fwidths, table_id, job_config,audit_load_key):
fs = GCSFileSystem()
source = f"{config['bucket']}/{blob_name}"
with fs.open(source, "r", encoding="utf-8", errors="ignore") as source:
count = 0
block = ""
while True:
line = source.readline()
block += line
count += 1
if count % 10000 == 0:
dataframe = string_2_dataframe(block, fnames, fwidths,audit_load_key)
result = dataframe_2_bigquery(dataframe, table_id, job_config)
block = ""
if not line:
dataframe = string_2_dataframe(block, fnames, fwidths,audit_load_key)
result = dataframe_2_bigquery(dataframe, table_id, job_config)
break
And here is my sample text file.
Please help me to solve this issue.
Related
¿Could anybody help me with the xml transformation to csv?
I have a nested xml file with around 200 columns and I need to choose some of them (id, email, date etc.) With the code below I obtain a csv file but only with 20 columns and it takes only the information from the first row.
data = s3.get_object(Bucket=source_bucket, Key=file_key)
contents = data['Body'].read()
tree = ET.fromstring(contents)
root = tree.findall(".")
# open a temp file for writing
csv_data = open('/tmp/converted.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(csv_data)
csv_head = []
csv_body = []
count = 0
for i in range(len(root[0])):
for x in root[0][i]:
if count == 0:
csv_head.append(x.tag)
csv_body.append(x.text)
print("***********")
print(x.text)
count = count + 1
csv_body.append("\n")
csvwriter.writerow(csv_head)
csvwriter.writerow(csv_body)
csv_data.close()
I also tried the next code, but it does not work:
csv_head = ['order_id', 'date', 'email']
csv_body = []
for i in root:
order_id = i.find('order_id')
if order_id != None:
order_id = order_id.text
date = i.find('date')
if date != None:
date = date.text
email = i.find('email')
if email != None:
email = email.text
csv_body.append({'order_id': order_id, 'date': date, 'email': email})
print("***********")
csvwriter.writerow(csv_head)
csvwriter.writerow(csv_body)
csv_data.close()
import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before='+ str(before) + '&subreddit='+ str(sub) + '&sort=asc&sort_type=created_utc&size=400'
print(url)
r = requests.get(url).json()
# data = json.loads(r.text, strict=False)
return r['data']
def collect_subData(subm):
subData = list() #list to store data points
title = subm['title']
url = subm['url']
try:
flair = subm['link_flair_text']
except KeyError:
flair = "NaN"
try:
# returns the body of the posts
body = subm['selftext']
except KeyError:
body = ''
author = subm['author']
subId = subm['id']
score = subm['score']
created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
numComms = subm['num_comments']
permalink = subm['permalink']
subData.append((subId,title,body,url,author,score,created,numComms,permalink,flair))
subStats[subId] = subData
def update_subFile():
upload_count = 0
location = "subreddit_data_uncleaned/"
print("Input filename of submission file, please add .csv")
filename = input()
file = location + filename
with open(file, 'w', newline='', encoding='utf-8') as file:
a = csv.writer(file, delimiter=',')
headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
a.writerow(headers)
for sub in subStats:
a.writerow(subStats[sub][0])
upload_count+=1
print(str(upload_count) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())
data = get_pushshift_data(after, before, sub)
while len(data) > 0:
for submission in data:
collect_subData(submission)
subCount+=1
# Calls getPushshiftData() with the created date of the last submission
print(len(data))
print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
after = data[-1]['created_utc']
data = get_pushshift_data(after, before, sub)
print(len(data))
update_subFile()
At line 1: I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But then when I want to the same thing again at line 11 but with different time for after variable(type: int), the program comes out the error of JSONDecodeError: Expecting value: line 1 column 1 (char 0).
This is the image for you to refer to which I have just described above
This is the Error Image
so, toward the end of my first file; we'll call /file.py.
def get_excel_data(self):
"""Places excel data into pandas dataframe"""
# excel_data = pandas.read_excel(self.find_file())
for extracted_archive in self.find_file():
excel_data = pandas.read_excel(extracted_archive)
# print(excel_data)
columns = pandas.DataFrame(columns=excel_data.columns.tolist())
excel_data = pandas.concat([excel_data, columns])
excel_data.columns = excel_data.columns.str.strip()
excel_data.columns = excel_data.columns.str.replace("/", "_")
excel_data.columns = excel_data.columns.str.replace(" ", "_")
total_records = 0
num_valid_records = 0
num_invalid_records = 0
for row in excel_data.itertuples():
mrn = row.MRN
total_records += 1
if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
# print(f"Invalid record: {row}")
num_invalid_records += 1
# total_invalid = num_invalid_records + dup_count
excel_data = excel_data.drop(excel_data.index[row.Index])
# continue
else:
# print(mrn) # outputs all MRN ids
for row in excel_data.itertuples():
num_valid_records += 1
continue
with open("./logs/metrics.csv", "a", newline="\n") as f:
csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
# csv_writer.writeheader()
currentDT = datetime.datetime.now()
success_rate = num_valid_records / total_records * 100
csv_writer.writerow(dict(date=currentDT,
total_records=total_records,
processed=num_valid_records,
skipped=num_invalid_records,
success_rate=num_valid_records / total_records * 100))
return self.clean_data_frame(excel_data)
def clean_data_frame(self, data_frame):
"""Cleans up dataframes"""
for col in data_frame.columns:
if "date" in col.lower():
data_frame[col] = pandas.to_datetime(data_frame[col],
errors='coerce', infer_datetime_format=True)
data_frame[col] = data_frame[col].dt.date
data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
return data_frame
def get_mapping_data(self):
map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
columns = pandas.DataFrame(columns=map_data.columns.tolist())
return pandas.concat([map_data, columns])
in my second file I would like to keep that end state; and do another iteration for instance.... second_file.py
def process_records(self, records, map_data, completed=None, errors=None):
"""Code to execute after webdriver initialization."""
series_not_null = False
try:
num_attempt = 0
for record in data_frame.itertuples(): # not working
print(record)
series_not_null = True
mrn = record.MRN
self.navigate_to_search(num_attempt)
self.navigate_to_member(mrn)
self.navigate_to_assessment()
self.add_assessment(record, map_data)
self.driver.switch_to.parent_frame() # not working
sleep(.5)
error_flag = self.close_member_tab(self.driver, mrn, error_flag)
except Exception as exc:
if series_not_null:
errors = self.process_series_error(exc)
return completed, error
both have import pandas
you can save your dataframe in a pickle file like this. it is also worth noting that you can store most anything in a pickle file. here is a link to some info here: pickle info
import pandas as pd
import pickle
x = pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]})
#this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh: #notice that you need the 'wb' for the dump
pickle.dump(x, fh)
#to load the file do this
with open('pickledata.p', 'rb') as fh: #you need to use 'rb' to read
df = pickle.load(fh)
#you can now use df like a normal dataframe
print(df)
you dont actually need the '.p' extension for a pickle file, i just like it.
so you save your dataframe at the end of script one, and then load it in at the start of script 2.
Use Dataframe.to_pickle and pandas.read_pickle:
To persist
df.to_pickle('./dataframe.pkl')
To load
df = pd.read_pickle('./dataframe.pkl')
this function reads from a text file and re-formats the contents, and then writes the contents to a csv. I'm trying to use threading to multi-thread the for i in lines loop, this is the longest part of a larger script and takes up most of the run time because the list lines contains thousands of elements. Can someone help me straighten this out? Doing this synchronously instead of in parallel is taking up tons of time. I have seen many other answers to similar questions but I've yet to understand the answers and implement them correctly so far.
def sheets(i):
# time format for spreadsheet
dt_time = datetime.now().strftime('%m/%d|%H:%M')
# for league name (NFL,NBA,NHL ETC.) in list containing league names
for league_name in leagues2:
league_name = league_name.split('|')[0]
with open(final_stats_path, 'r+') as lines:
lines = lines.readlines()
# i = one long string containg details about the event in the loop, eg. sport, game day, game id, home team name
for i in lines:
i = i.split(',')
minprice = i[6]
totaltix = i[5]
event_date = i[2]
try:
dayofweek = datetime.strptime(event_date, '%Y-%m-%d').strftime('%A')
except:
continue
event_date = i[2][2:]
event_date = str(event_date).split('-')
event_date = event_date[1]+'/'+event_date[2]
sport = i[4]
event = i[1].replace('Basketball','').replace('\n','')
away = i[8].replace('Basketball', '').replace('\n','')
eventid = i[0]
event_home = i[9].replace('Basketball', '').replace('\n','')
event = event.split(' at ')[0]
tixdata = str(totaltix)
eventid = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId='+str(eventid)+'§ionId=0'
directory = root+'\data'+'\\'+sport+'\\'
report = directory+'report.xlsx'
fname = directory+'teams.txt'
eventleague = sport
f = open(directory+'acronym.txt', 'r+')
lines_2 = f.readlines()
for qt in lines_2:
qt = qt.split('-')
compare = qt[1]
if event_home in compare:
event_home = qt[0]
else:
pass
troop = []
d = {
'ID' : eventid,
'Date' : event_date,
'Day' : dayofweek,
'Away' : away,
}
s = {
'time' : tixdata
}
numbers = event_home+'.txt'
numbers_new = 'bk\\bk_'+numbers
with open(directory+numbers_new, 'a+') as y:
pass
with open(directory+numbers, 'a+') as o:
pass
with open(directory+numbers, 'r+') as g:
for row in g:
if str(eventid) in row:
#print('the event is in the list')
row_update = row.replace('}', ", '"+dt_time+"': '"+tixdata+"'}")
with open(directory+numbers_new, 'a+') as y:
y.write(row_update)
break
else:
with open(directory+numbers, 'a+') as p:
#print('the event is not in the list')
p.write(str(d)+'\n')
with open(directory+numbers_new, 'a+') as n:
n.write(str(d)+'\n')
sizefile = os.path.getsize(directory+numbers_new)
if sizefile > 0:
shutil.copy(directory+numbers_new, directory+numbers)
open(directory+numbers_new, 'w').close()
else:
pass
df = []
with open(directory+numbers, 'r+') as t:
for row in t:
b = eval(row)
dfs = df.append(b)
df = pd.DataFrame(df)
yark = list(df.columns)[:-5]
zed = ['ID', 'Date', 'Day', 'Away']
columns = zed+yark
try:
df = df[columns]
except:
pass
df.index = range(1, 2*len(df)+1, 2)
df = df.reindex(index=range(2*len(df)))
writer = pd.ExcelWriter(directory+event_home+'.xlsx', engine='xlsxwriter')
try:
df.to_excel(writer, sheet_name=event_home)
except:
continue
workbook = writer.book
worksheet = writer.sheets[event_home]
format1 = workbook.add_format({'num_format': '#,##0.00'})
worksheet.set_column('A:ZZ', 18, format1)
writer.save()
if __name__ == "__main__":
pool = ThreadPool(8) # Make the Pool of workers
results = pool.map(sheets) #Open the urls in their own threads
pool.close() #close the pool and wait for the work to finish
pool.join()
##get_numbers()
##stats_to_csv()
##stats_to_html()
#sheets()
Try changing the following line:
results = pool.map(sheets)
to:
results = pool.map(sheets,range(8))
I'm trying to run parallel threads in a spark job. This works without a hitch when I run the python script from the cli, but my understanding is that is not really capitalizing on the EMR cluster parallel processing benefits. It does not actually save the data when I run as a spark job. I'm not even sure it's creating the spark dataframe when I run it as a spark job.
I also tried doing using map instead of doing the parallel threads, but couldn't get that to work either.
If I can't get the parallelism to work as a spark job, it seems like I might as well just run it on a single ec2 instance with the parallel threads.
So the basic logic is this -
Create spark context at top of script
Inside a class - Pull list of files to process from an SQS queue
Loop over list of files with the following method
# this is run for 10 blocks of 10 files each across the EMR cluster in parallel
def parquet_driver(self):
max_threads = 20
futures=[]
pool = ThreadPoolExecutor(max_threads)
i = 0
total_files_processed = 0
while total_files_processed <= len(self.master_file_list):
while i < max_threads:
print('Processing %s' % self.master_file_list[i])
futures.append(pool.submit(self.convert_to_parquet,
self.master_file_list[i]))
i += 1
for x in as_completed(futures):
pass
# add in i number of files to the total
total_files_processed += i
Notice this is passing a file to a method called "convert_to_parquet".
def convert_to_parquet(self, file):
log_file_name = file.split(':')[2].replace('.dat', '.log')
logger = Logger(log_file_name).get()
try:
bucket = s3.Bucket(file.split(':')[0])
file_name = file.split(':')[2]
file_obj = bucket.Object(file.split(':')[1] + '/' + file.split(':')[2])
partition_key = file.split(':')[2].split('.')[2]
target_table = file.split(':')[2].split('.')[1]
receipt_handle = file.split(':')[3]
file_contents = file_obj.get()["Body"].read()
if 'al1' not in file.split(':')[2]:
logger.debug('Record type = %s, deleting from queue and returning ..' % target_table)
else:
logger.debug('Working on %s..' % target_table)
app_name = file
#sc = SparkContext(appName=app_name)
print('Reading the following file from s3: %s' % file_name)
print('Found the following file contents on s3: %s' % file_contents)
rdd = sc.parallelize(file_contents.split('\n')).map(lambda line: line.split(','))
# rdd = sc.textFile(csv_file).map(lambda line: line.split(','))
# pd.read_csv(csv_file)
sqlContext = sql.SQLContext(sc)
if hasattr(rdd, "toDF"):
df = rdd.toDF()
else:
spark = SparkSession
df = rdd.toDF()
logger.debug("Partitioning data to: {0}".format(partition_key))
# Go to redshift and get the data definition
metadata = self.build_df_definition('al1')
if 'cycle_date' in metadata['columns']:
metadata['columns'].remove('cycle_date')
if 'log_timestamp' in metadata['columns']:
metadata['columns'].remove('log_timestamp')
cols = metadata['columns']
data_types = metadata['data_types']
for idx in range(0,len(cols)):
col_str = '_' + str(int(idx) + 1)
df_field_value = regexp_replace(df[col_str], '"', '')
df = df.withColumn(cols[idx],df_field_value.cast(data_types[idx]))
df = df.withColumn("cycle_date",lit(partition_key))
# this field will be pushed to the sqs queue
df = df.withColumn("log_timestamp",lit(self.log_timestamp))
full_cols = cols
full_cols.append('cycle_date')
full_cols.append('log_timestamp')
print(full_cols)
ref_df = df.select(full_cols)
ref_df.show()
partitionby=['year','month','day']
output='/opt/data/' + '/' + target_table
s3_loc = 's3://<bucket>/<prefix>/' + target_table
codec='snappy'
ref_df.write.partitionBy(['cycle_date']).format("parquet").save(s3_loc, mode="append")
#sc.stop()
except Exception as e:
logger.debug(e)
traceback.print_exc()
open("/opt/logs/dump.log","w").write(traceback.print_exc())
exit()
else:
# Delete received message from queue
sqs.delete_message(
QueueUrl=self.queue_url,
ReceiptHandle=receipt_handle
)
logger.debug('Received and deleted file: %s' % file)