How to Speed Up Writing a DataFrame to S3 from an EMR PySpark Notebook?

So I'm learning PySpark by playing around with the DMOZ dataset in a Jupyter notebook attached to an EMR cluster. The process I'm trying to achieve is as follows:
Load a CSV with the locations of files in an S3 public dataset into a PySpark DataFrame (~130k rows)
Map over the DF with a function that retrieves the file contents (HTML) and rips out the text
Join the output with the original DF as a new column
Write the joined DF to S3 (the problem: it seems to hang forever; it's not a large job and the output JSON should only be a few gigs)
All of the writing is done in a function called run_job()
I let it sit for about 2 hours on a cluster with 10 m5.8xlarge instances, which should be enough (?). All of the other steps execute fine on their own, except for the df.write(). I have tested on a much smaller subset and it wrote to S3 with no issue, but when I go to do the whole file it seemingly hangs at "0/n jobs complete."
I am new to PySpark and distributed computing in general, so it's probably a simple "best practice" that I am missing. (Edit: Maybe it's in the config of the notebook? I'm not using any magics to configure Spark currently, do I need to?)
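For reference, EMR notebook kernels (Sparkmagic) do accept a %%configure cell magic that sets the Livy session properties before the Spark session starts. A sketch of what such a cell might look like; the specific values here are placeholders, not a recommendation:

%%configure -f
{
    "executorMemory": "8g",
    "executorCores": 4,
    "conf": {
        "spark.sql.shuffle.partitions": "200",
        "spark.dynamicAllocation.enabled": "true"
    }
}

With -f the current session is dropped and recreated with the new settings.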
Code below...
import html2text
import boto3
import botocore
import os
import re
import zlib
import gzip
from bs4 import BeautifulSoup as bs
from bs4 import Comment
# from pyspark import SparkContext, SparkConf
# from pyspark.sql import SQLContext, SparkSession
# from pyspark.sql.types import StructType, StructField, StringType, LongType
import logging
def load_index():
    input_file = 's3://cc-stuff/uploads/DMOZ_bussineses_ccindex.csv'
    df = spark.read.option("header", True) \
        .csv(input_file)
    #df = df.select('url_surtkey','warc_filename', 'warc_record_offset', 'warc_record_length','content_charset','content_languages','fetch_time','fetch_status','content_mime_type')
    return df
def process_warcs(id_, iterator):
    html_textract = html2text.HTML2Text()
    html_textract.ignore_links = True
    html_textract.ignore_images = True
    no_sign_request = botocore.client.Config(signature_version=botocore.UNSIGNED)
    s3client = boto3.client('s3', config=no_sign_request)
    text = None
    s3pattern = re.compile('^s3://([^/]+)/(.+)')
    PREFIX = "s3://commoncrawl/"
    for row in iterator:
        try:
            start_byte = int(row['warc_record_offset'])
            stop_byte = (start_byte + int(row['warc_record_length']))
            s3match = s3pattern.match((PREFIX + row['warc_filename']))
            bucketname = s3match.group(1)
            path = s3match.group(2)
            #print('Bucketname: ',bucketname,'\nPath: ',path)
            resp = s3client.get_object(Bucket=bucketname, Key=path, Range='bytes={}-{}'.format(start_byte, stop_byte))
            content = resp['Body'].read()#.decode()
            data = zlib.decompress(content, wbits=zlib.MAX_WBITS | 16).decode('utf-8', errors='ignore')
            data = data.split('\r\n\r\n', 2)[2]
            soup = bs(data, 'html.parser')
            for x in soup.findAll(text=lambda text: isinstance(text, Comment)):
                x.extract()
            for x in soup.find_all(["head", "script", "button", "form", "noscript", "style"]):
                x.decompose()
            text = html_textract.handle(str(soup))
        except Exception as e:
            pass
        yield (id_, text)
def run_job(write_out=True):
    df = load_index()
    df2 = df.rdd.repartition(200).mapPartitionsWithIndex(process_warcs).toDF()
    df2 = df2.withColumnRenamed('_1', 'idx').withColumnRenamed('_2', 'page_md')
    df = df.join(df2.select('page_md'))
    if write_out:
        output = "s3://cc-stuff/emr-out/DMOZ_bussineses_ccHTML"
        df.coalesce(4).write.json(output)
    return df
df = run_job(write_out=True)

So I managed to make it work. I attribute this to either of the two changes below. I also changed the hardware configuration and opted for a higher quantity of smaller instances. Gosh, I just LOVE it when I spend an entire day in a deep state of utter confusion when all I needed to do was add a "/" to the save location......
1. I added a trailing "/" to the output location in S3.
Old:
output = "s3://cc-stuff/emr-out/DMOZ_bussineses_ccHTML"
New:
output = "s3://cc-stuff/emr-out/DMOZ_bussineses_ccHTML/"
2. I removed the coalesce in the run_job() function. I have 200 output files now, but it worked and it was super quick (under 1 min).
Old:
df.coalesce(4).write.json(output)
New:
df.write.mode('overwrite').json(output)
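For what it's worth, if fewer output files are needed later, repartitioning to a moderate count right before the write keeps the final stage parallel instead of funnelling everything through 4 tasks the way coalesce(4) does. A sketch; the partition count of 50 is just an assumption:

df.repartition(50).write.mode('overwrite').json(output)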

Related

A DataFrame object does not have an attribute select

In Palantir Foundry, I am trying to read all XML files from a dataset. Then, in a for loop, I parse the XML files.
Up until the second-to-last line, the code runs fine without errors.
from transforms.api import transform, Input, Output
from transforms.verbs.dataframes import sanitize_schema_for_parquet
from bs4 import BeautifulSoup
import pandas as pd
import lxml
@transform(
    output=Output("/Spring/xx/datasets/mydataset2"),
    source_df=Input("ri.foundry.main.dataset.123"),
)
def read_xml(ctx, source_df, output):
    df = pd.DataFrame()
    filesystem = source_df.filesystem()
    hadoop_path = filesystem.hadoop_path
    files = [f"{hadoop_path}/{f.path}" for f in filesystem.ls()]
    for i in files:
        with open(i, 'r') as f:
            file = f.read()
        soup = BeautifulSoup(file, 'xml')
        data = []
        for e in soup.select('offer'):
            data.append({
                'meldezeitraum': e.find_previous('data').get('meldezeitraum'),
                'id': e.get('id'),
                'parent_id': e.get('parent_id'),
            })
        df = df.append(data)
    output.write_dataframe(sanitize_schema_for_parquet(df))
However, as soon as I add the last line:
output.write_dataframe(sanitize_schema_for_parquet(df))
I get this error:
Missing transform attribute
A DataFrame object does not have an attribute select. Please check the spelling and/or the datatype of the object.
/transforms-python/src/myproject/datasets/mydataset.py
output.write_dataframe(sanitize_schema_for_parquet(df))
What am I doing wrong?
You have to convert your pandas DataFrame to a Spark DataFrame. Even though they have the same name, those are two different object types in Python.
The easiest way to do that is:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_spark = spark.createDataFrame(df)
You can then pass df_spark to the output.write_dataframe() function.
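A minimal, self-contained sketch of the conversion, with toy rows standing in for the frame built from the XML files in the question:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# toy rows standing in for the parsed XML data
pdf = pd.DataFrame([{'meldezeitraum': '2023-01', 'id': '1', 'parent_id': '0'}])

df_spark = spark.createDataFrame(pdf)   # now a Spark DataFrame
df_spark.printSchema()

# inside the Foundry transform this is what gets written out:
# output.write_dataframe(sanitize_schema_for_parquet(df_spark))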

Approach to merge a template with header and items with data for each entry

I'm trying to learn Python and find a solution for my business.
I'm working in SAP and I need to merge data to fill a template.
I did the merge with Excel VBA; it works, but filling a file with 10K entries takes a very long time.
My template is available here:
https://docs.google.com/spreadsheets/d/1FXc-4zUYx0fjGRvPf0FgMjeTm9nXVfSt/edit?usp=sharing&ouid=113964169462465283497&rtpof=true&sd=true
And a sample of the data is here:
https://drive.google.com/file/d/105FP8ti0xKbXCFeA2o5HU7d2l3Qi-JqJ/view?usp=sharing
So I need to merge each record from my data file into the Excel template, where we have a header and 2 lines (it's an FI posting, so I need to fill the debit and the credit).
In VBA, I proceeded like this:
Fix the cell.
Copy data from the template with the function activecell.offset(x,y) ...
From my data file, fill the different records based on technical name.
Now I'm trying to do the same in Python.
Using pandas or openpyxl I can open the file, but I can't see how I can continue or find a way to merge the header data (which must be copied for each posting I have to book) and the data.
from tkinter import *
from tkinter import filedialog  # filedialog is a submodule and is not pulled in by the star import
import pandas as pd
import datetime
from openpyxl import load_workbook
import numpy as np

def sap_line_item(ligne):
    ledger = ligne
    print(ligne)
    return

# Constante
c_dir = '/Users/sapfinance/PycharmProjects/SAP'
C_FILE_SEP = ';'

root = Tk()
root.withdraw()
# folder_selected = filedialog.askdirectory(initialdir=c_dir)
fiori_selected = filedialog.askopenfile(initialdir=c_dir)
data_selected = filedialog.askopenfile(initialdir=c_dir)

# read data
pd.options.display.float_format = '{:,.2f}'.format
fichier_cible = str(data_selected.name)
target_filename = fichier_cible + '_' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '.xlsx'
# target = pd.ExcelWriter(target_filename, engine='xlsxwriter')
df_full_data = pd.read_csv(data_selected.name, sep=C_FILE_SEP, encoding='unicode_escape', dtype='unicode')
nb_ligne_data = int(len(df_full_data))
print(nb_ligne_data)
#df_fiori = pd.read_excel(fiori_selected.name)
print(fiori_selected.name)
df_fiori = load_workbook(fiori_selected.name)
df_fiori_data = df_fiori.active
Any hints on how to approach this and find a solution would be appreciated.
Have a great day
Philippe
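A minimal sketch of one general approach in pandas: loop over the data records, emit a header row plus a debit and a credit line per posting, and write everything out once at the end. The column names (amount, account_debit, account_credit, ...) are hypothetical placeholders, not the real template fields:

import pandas as pd

df_data = pd.read_csv('data.csv', sep=';', dtype='unicode')  # the sample data file

rows = []
for _, rec in df_data.iterrows():
    # header line of the posting (hypothetical field names)
    rows.append({'line_type': 'H', 'doc_date': rec.get('doc_date'), 'amount': rec.get('amount')})
    # debit line
    rows.append({'line_type': 'D', 'account': rec.get('account_debit'), 'amount': rec.get('amount')})
    # credit line
    rows.append({'line_type': 'C', 'account': rec.get('account_credit'), 'amount': rec.get('amount')})

pd.DataFrame(rows).to_excel('filled_template.xlsx', index=False)

Building all rows in a plain list and writing once is usually much faster than filling cells one by one, which is where the VBA version tends to lose time.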

Trouble merging dask dataframes

I have several .pcap files whose data I want to write to one large dask dataframe. Currently, my code initializes a dask dataframe using data from the first file. It is then supposed to process the rest of the pcap files and add their data to that dask dataframe using merge/concat. However, when I check the number of rows of the merged dask dataframe, it doesn't increase. What is happening?
I am also not sure if I am using the right approach for my use case. I am trying to convert my entire dataset into a giant dask dataframe and write it out to an h5 file. My computer doesn't have enough memory to load the entire dataset, which is why I'm using dask. The idea is to load a dask dataframe that contains the entire dataset so I can do operations on all of it. I'm new to dask and I've read over some of the documentation, but I'm still fuzzy about how dask handles loading data from disk instead of memory. I'm also fuzzy about how partitions work in dask; specifically, I'm not sure how chunksize differs from partitions, so I'm having trouble properly partitioning this dataframe. Any tips and advice would be helpful.
As said before, I've read over the main parts of the documentation.
I've tried using dd.merge(dask_df, panda_df) as shown in the documentation. When I initialize the dask dataframe, it starts with 6 rows. When I use merge, the row count decreases to 1.
I've also tried using concat. Again, I have a count of 6 rows after initialization. However, after the concat operations the row count still remains at 6. I would expect the row count to increase.
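Regarding the confusion about partitions and chunksize, a small sketch of how partitions behave in dask: each partition is an ordinary pandas DataFrame, and npartitions (or blocksize when reading from disk) only controls how the data is split up, not how much has to fit in memory at once. The file name and blocksize below are assumptions:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'x': range(10)})
ddf = dd.from_pandas(pdf, npartitions=3)

print(ddf.npartitions)                      # 3
print(ddf.map_partitions(len).compute())    # rows in each partition

# when reading from disk, blocksize plays the same role:
# ddf = dd.read_csv('big.csv', blocksize='64MB')   # roughly one partition per 64 MB chunk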
Here is the initialization function
import os
import sys
import h5py
import pandas as pd
import dask.dataframe as dd
import gc
import pprint
from scapy.all import *
flags = {
    'R': 0,
    'A': 1,
    'S': 2,
    'DF': 3,
    'FA': 4,
    'SA': 5,
    'RA': 6,
    'PA': 7,
    'FPA': 8
}

def initialize(file):
    global flags
    data = {
        'time_delta': [0],
        'ttl': [],
        'len': [],
        'dataofs': [],
        'window': [],
        'seq_delta': [0],
        'ack_delta': [0],
        'flags': []
    }
    scap = sniff(offline=file, filter='tcp and ip')
    for packet in range(0, len(scap)):
        pkt = scap[packet]
        flag = flags[str(pkt['TCP'].flags)]
        data['ttl'].append(pkt['IP'].ttl)
        data['len'].append(pkt['IP'].len)
        data['dataofs'].append(pkt['TCP'].dataofs)
        data['window'].append(pkt['TCP'].window)
        data['flags'].append(flag)
        if packet != 0:
            lst_pkt = scap[packet-1]
            data['time_delta'].append(pkt.time - lst_pkt.time)
            data['seq_delta'].append(pkt['TCP'].seq - lst_pkt['TCP'].seq)
            data['ack_delta'].append(pkt['TCP'].ack - lst_pkt['TCP'].ack)
    panda = pd.DataFrame(data=data)
    panda['ttl'] = panda['ttl'].astype('float16')
    panda['flags'] = panda['flags'].astype('float16')
    panda['dataofs'] = panda['dataofs'].astype('float16')
    panda['len'] = panda['len'].astype('float16')
    panda['window'] = panda['window'].astype('float32')
    panda['seq_delta'] = panda['seq_delta'].astype('float32')
    panda['ack_delta'] = panda['ack_delta'].astype('float32')
    df = dd.from_pandas(panda, npartitions=6)
    gc.collect()
    return df
Here is the concatenation function
def process(file):
    global flags
    global df
    data = {
        'time_delta': [0],
        'ttl': [],
        'len': [],
        'dataofs': [],
        'window': [],
        'seq_delta': [0],
        'ack_delta': [0],
        'flags': []
    }
    scap = sniff(offline=file, filter='tcp and ip')
    for packet in range(0, len(scap)):
        pkt = scap[packet]
        flag = flags[str(pkt['TCP'].flags)]
        data['ttl'].append(pkt['IP'].ttl)
        data['len'].append(pkt['IP'].len)
        data['dataofs'].append(pkt['TCP'].dataofs)
        data['window'].append(pkt['TCP'].window)
        data['flags'].append(flag)
        if packet != 0:
            lst_pkt = scap[packet-1]
            data['time_delta'].append(pkt.time - lst_pkt.time)
            data['seq_delta'].append(pkt['TCP'].seq - lst_pkt['TCP'].seq)
            data['ack_delta'].append(pkt['TCP'].ack - lst_pkt['TCP'].ack)
    panda = pd.DataFrame(data=data)
    panda['ttl'] = panda['ttl'].astype('float16')
    panda['flags'] = panda['flags'].astype('float16')
    panda['dataofs'] = panda['dataofs'].astype('float16')
    panda['len'] = panda['len'].astype('float16')
    panda['window'] = panda['window'].astype('float32')
    panda['seq_delta'] = panda['seq_delta'].astype('float32')
    panda['ack_delta'] = panda['ack_delta'].astype('float32')
    # merge version: dd.merge(df, panda)
    dd.concat([df, dd.from_pandas(panda, npartitions=6)])
    gc.collect()
And here is the main program
directory = 'dev/streams/'
files = os.listdir(directory)
df = initialize(directory+files[0])
files.remove(files[0])
for file in files:
    process(directory + file)
print(len(df))
using merge:
print(len(df)) = 1
using concat:
print(len(df))=6
expected:
print(len(df)) > 10,000
Try explicitly assigning the result of the dask concat back to df: dd.concat returns a new dataframe rather than modifying df in place.
df = dd.concat([df, dd.from_pandas(panda,npartitions=6)])
And don't duplicate the exact same blocks of code; encapsulate them in another function:
def process_panda(file_wpath, flags):
    data = {
    [...]
    panda['ack_delta'] = panda['ack_delta'].astype('float32')
    return panda
Then you just have to test whether the file being processed is the first one, so your main code becomes:
import os
import sys
import h5py
import pandas as pd
import dask.dataframe as dd
import gc
import pprint
from scapy.all import *
flags = {
    'R': 0,
    'A': 1,
    'S': 2,
    'DF': 3,
    'FA': 4,
    'SA': 5,
    'RA': 6,
    'PA': 7,
    'FPA': 8
}

directory = 'dev/streams/'
files = os.listdir(directory)

for file in files:
    file_wpath = os.path.join(directory, file)
    panda = process_panda(file_wpath, flags)
    if file == files[0]:
        df = dd.from_pandas(panda, npartitions=6)
    else:
        df = dd.concat([df, dd.from_pandas(panda, npartitions=6)])
    gc.collect()

print(len(df))
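Since the stated goal is to end up with the whole dataset in an h5 file, the final write might look like this sketch (the file name and key pattern are assumptions):

df.to_hdf('dataset-*.h5', '/data')      # '*' in the path writes one file per partition
# or keep a single file with one key per partition:
# df.to_hdf('dataset.h5', '/data-*')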

Call external script

I have a long list of pandas transformation commands that I need to run against a pandas DataFrame:
pd['newvar_A'] = pd['somevar'] * pd['somevar']
pd['newvar_C'] = pd['somevar'] * pd['somevar']
pd['newvar_D'] = pd['somevar'] * pd['somevar']
pd['newvar_ETC'] = pd['somevar'] * pd['somevar']
It's a long list (about 150 lines). Is it possible to keep these in a separate script called transformations.py and include it from an already existing script? The idea is to keep the main script simple, so I'd like the main script to look like this:
import pandas as pd
pd.read_csv ('data.csv')
...
#Run transformations
insert file = "transformations.py"
...
#rest of the main script
Is there a Python command to call another Python script (assuming this script is located in the same folder as the working directory)?
Thanks!
You can try to "import" the script; that's generally the best way, as discussed in this post.
A small example
sample.csv
name,age
sharon,12
shalom,10
The script which I am going to import
nameChange.py
import pandas as pd
# transform the csv file
data = pd.read_csv('sample.csv')
data.iloc[0,0] = 'justin'
data.to_csv('sample.csv',index = False)
The main code
stackoverflow.py
import pandas as pd
# before transform
data = pd.read_csv('sample.csv')
print(data)
# call the script
import nameChange
# do the work after the script runs
transformed_data = pd.read_csv('sample.csv')
print(transformed_data)
Output
name age
0 sharon 12
1 shalom 10
name age
0 justin 12
1 shalom 10
To run the above code without modifying the original CSV:
The script which I am going to import
nameChange.py
import pandas as pd
import pickle
# transform the csv file variable which was saved by stackoverflow.py
data = pickle.load(open('data.sav','rb'))
data.iloc[0,0] = 'justin'
# saving the df
pickle.dump(data,open('data.sav','wb'))
The main code
stackoverflow.py
import pandas as pd
import pickle
# before transform
data = pd.read_csv('sample.csv')
print(data)
pickle.dump(data,open('data.sav','wb'))
# call the script
import nameChange
transformed_data = pickle.load(open('data.sav','rb'))
# do the work after the script runs
print(transformed_data)
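A slightly different take on the same idea: if the 150 lines are wrapped in a function inside transformations.py, nothing runs at import time and there is no need for the CSV/pickle round trip; the function just transforms the DataFrame that is already in memory. A sketch; apply_transformations is a made-up name:

# transformations.py
def apply_transformations(df):
    df['newvar_A'] = df['somevar'] * df['somevar']
    # ... the remaining ~150 transformation lines ...
    return df

# main script
import pandas as pd
from transformations import apply_transformations

df = pd.read_csv('data.csv')
df = apply_transformations(df)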

Python with very nested JSON to CSV file

https://www.eex.com/data//view/data/detail/phelix-power-futures/2018/02.27.json
I have changed the script following Stev's answer. The error no longer applies.
#import pandas as pd
import requests
import json
import csv
outfile = open('D:\\test.csv','w')
url = 'https://www.eex.com/data//view/data/detail/phelix-power-futures/2018/02.27.json'
resp = requests.get(url)
data = json.loads(resp.content.decode('UTF8'))
for d in data['data']:
    for r in d['rows']:
        for sd in (d['rows']):
            for td in (sd['data']):
                dparsed = sd['data']
                w = csv.DictWriter(outfile, dparsed.keys())
                w.writeheader()
                w.writerow(dparsed)
I ran the script and it created the CSV file, but it shows 0 KB and says it is locked by another user, so I don't know exactly what I have goofed up this time. This is clearly not a duplicate question, so thanks for flagging it as such... /s
I ran the above script and after about 3 hours of waiting I killed Spyder to see what happened with the file. It kind of worked, but it only managed to spit out some of the data into columns and about 3 rows. Not sure where I fell down yet.
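One likely reason for the 0 KB / "locked by another user" symptoms is that outfile is never closed, so nothing gets flushed to disk and the handle stays open. A minimal sketch using a context manager (the toy row stands in for dparsed from the loop above):

import csv

row = {'a': 1, 'b': 2}   # stands in for dparsed
with open('test.csv', 'w', newline='') as outfile:
    w = csv.DictWriter(outfile, row.keys())
    w.writeheader()
    w.writerow(row)
# the file is flushed and closed when the with-block exits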
This is more of a comment than an answer, but I am not sure your JSON file is formatted properly for pd.json_normalize. You might have to loop through your JSON file, using something like the following:
import pandas as pd
import requests
import json
url = 'https://www.eex.com/data//view/data/detail/phelix-power-futures/2018/02.27.json'
resp = requests.get(url)
data = json.loads(resp.content.decode('UTF8'))
df1 = pd.DataFrame()
df2 = pd.DataFrame()
for d in data['data']:
    # print(d['identifier'])
    for r in d['rows']:
        # print(r['contractIdentifier'])
        # print(r['data'])
        df1 = df1.append(pd.json_normalize(r['data']))
        df2 = df2.append(pd.DataFrame([r['contractIdentifier']]))
        # print(r)
df = pd.concat([df1, df2], axis=1)
df.to_csv('my_file.txt')
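Note that DataFrame.append was deprecated and has been removed in newer pandas releases; an equivalent sketch collects the pieces in lists and concatenates once at the end (which is also faster than appending row by row):

frames1, frames2 = [], []
for d in data['data']:
    for r in d['rows']:
        frames1.append(pd.json_normalize(r['data']))
        frames2.append(pd.DataFrame([r['contractIdentifier']]))

df = pd.concat([pd.concat(frames1, ignore_index=True),
                pd.concat(frames2, ignore_index=True)], axis=1)
df.to_csv('my_file.txt')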
