How to generate PySpark data frame names dynamically - python

I have a table with data as shown in the diagram. I want to store the results in dynamically generated data frame names.
For example, here I want to create two data frames named dnb_df and es_df, store the read results in them, and print the structure of each data frame.
When I run the code below, I get the error:
SyntaxError: can't assign to operator (TestGlue2.py, line 66)
import sys
import boto3
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import regexp_replace, col
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
#sc.setLogLevel('DEBUG')
glueContext = GlueContext(sc)
spark = glueContext.spark_session
#logger = glueContext.get_logger()
#logger.DEBUG('Hello Glue')
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
client = boto3.client('glue', region_name='XXXXXX')
response = client.get_connection(Name='XXXXXX')
connection_properties = response['Connection']['ConnectionProperties']
URL = connection_properties['JDBC_CONNECTION_URL']
url_list = URL.split("/")
host = "{}".format(url_list[-2][:-5])
new_host=host.split('#',1)[1]
port = url_list[-2][-4:]
database = "{}".format(url_list[-1])
Oracle_Username = "{}".format(connection_properties['USERNAME'])
Oracle_Password = "{}".format(connection_properties['PASSWORD'])
#print("Oracle_Username:",Oracle_Username)
#print("Oracle_Password:",Oracle_Password)
print("Host:",host)
print("New Host:",new_host)
print("Port:",port)
print("Database:",database)
Oracle_jdbc_url="jdbc:oracle:thin:#//"+new_host+":"+port+"/"+database
print("Oracle_jdbc_url:",Oracle_jdbc_url)
source_df = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", "(select * from schema.table order by VENDOR_EXECUTION_ORDER) ").option("user", Oracle_Username).option("password", Oracle_Password).load()
vendor_data=source_df.collect()
for row in vendor_data:
    vendor_query = row.SRC_QUERY
    row.VENDOR_NAME+'_df' = spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", vendor_query).option("user", Oracle_Username).option("password", Oracle_Password).load()
    print(row.VENDOR_NAME+'_df')
Added use case in picture

Update: As discussed in the comments, your requirement is to further join all of these with another dataframe:
for row in vendor_data:
    rowAsDict = row.asDict()
    # Here you can use any variable, as rowAsDict is not going to be used anywhere else anyway
    rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"] = spark.sql(rowAsDict["SOURCE_QUERY"])
    main_dataframe = main_dataframe.join(rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"], "acc_id")
Input main_dataframe:
source_df:
View1 and View2:
Output main_dataframe:
If I understood correctly, you need to generate the VENDOR_NAME_DF dataframes dynamically.
You won't be able to assign to the Row object, nor would it be useful to assign a dataframe to a Row, since you can't create a dataframe with a column of type DataFrame.
You can, however, convert a Row to a dict using asDict and use that instead.
This would work:
vendor_data = source_df.collect()
for row in vendor_data:
    rowAsDict = row.asDict()
    # Replace this with spark.read() or any other way to create a dataframe
    rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"] = spark.sql(rowAsDict["SOURCE_QUERY"])
    rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"].show()
Input Source_DF:
Result of SOURCE_QUERY:
Output (of rowAsDict[rowAsDict["VENDOR_NAME"]+"_df"].show()):
Final rowAsDict:
{'VENDOR_NAME': 'Name1', 'SOURCE_QUERY': 'select * from view1', 'Name1_df': DataFrame[id: string, date: string, Code: string]}

Add the last two lines in your for loop and you should be able to get the results.
The first one creates a temporary view using the dynamic dataframe name.
The second shows the data in that temporary view.
for row in vendor_data:
    vendor_query = row.SRC_QUERY
    spark.read.format("jdbc").option("url", Oracle_jdbc_url).option("dbtable", vendor_query).option("user", Oracle_Username).option("password", Oracle_Password).load().createOrReplaceTempView(row.VENDOR_NAME+'_df')
    spark.sql("select * from "+row.VENDOR_NAME+"_df").show()

Related

I have 2 JSON objects in Python. For each of the sections in the 2nd object I need to append and nest them into the first. How would I achieve this?

Apologies if this does not make sense, but I have explained it the best I can.
I have 2 JSON objects in Python (orderdetails, orderlines) with a common field (order_id).
For each of the {data between the brackets with the same order_id as orderdetails} in orderlines, I need to append and nest them into orderdetails so the end result looks like this:
End Result
orderdetails is the larger data set and orderlines is the smaller one.
My data is pulled from SQL and then transformed into a dataframe and then to JSON.
I've looked at a few questions on this, but most of them don't really relate to what I want to do.
Here is the code I currently have:
from func.excelfunction import *
from datetime import datetime as dt
from datetime import timedelta
import requests
import csv
import os
import math
from sql_server.sql_server import *
import pandas as pd
import json
#variables
dir_path = os.path.dirname(os.path.realpath(__file__))
df1 = run_sql_df('SET NOCOUNT ON; exec isawitfirstdb.dbo.usp_Torque_orderDetails')
df1 = df1.astype({"created_at":"str","order_number":"str","order_id":"int64","instructions":"str","dispatch_method":"str","email":"str","Contact":"str",
"name":"str","address_1":"str","address_2":"str","town":"str","county":"str","postcode":"str","country":"str","invoice_currency":"str",
"subtotal_price":"str","total_price":"str","invoice_name":"str","invoice_contact_phone":"str","invoice_address_1":"str","invoice_address_2":"str",
"invoice_town":"str","invoice_county":"str","invoice_postcode":"str","inovice_country":"str","Owner_id":"str","Merge_status":"str","Merge_action":"str","Record_type":"str"})
df2 = run_sql_df('SET NOCOUNT ON; exec isawitfirstdb.dbo.usp_Torque_orderLineDetails')
df2 = df2.astype({"sku_id":"str","order_id":"int64","qty_ordered":"int64","user_def_type_1":"str","user_def_type_2":"str","user_def_num_1":"str","line_value":"int64","config_id":"str","Merge_status":"str","Merge_action":"str","Record_type":"str"})
#convert to dataframe
orders = pd.DataFrame(df1)
orderlines = pd.DataFrame(df2)
#convert to dataframe to json
orders = orders.to_json(orient="records")
orderlines = orderlines.to_json(orient="records")
# JSON
df = orders + orderlines
print(df)
Use some_dict.update({'some_exclusive_key': some_object}) to add a new key to a dict.
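For example, a minimal sketch with made-up records showing how update() nests the matching order lines under an order:

# Made-up records for illustration only.
order = {"order_id": 1, "order_number": "A100"}
orderlines = [{"order_id": 1, "sku_id": "S1", "qty_ordered": 2},
              {"order_id": 2, "sku_id": "S9", "qty_ordered": 1}]

# Nest the matching lines under a new key on the order dict.
order.update({"lines": [ol for ol in orderlines if ol["order_id"] == order["order_id"]]})
print(order)
# {'order_id': 1, 'order_number': 'A100', 'lines': [{'order_id': 1, 'sku_id': 'S1', 'qty_ordered': 2}]}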
I assume only one record exists for each order_id in orders but multiple can exist in orderlines. If so, you can iterate through the list of orders and, for each record, append a new field with the list of orderlines that match that order_id. You can get that list using filter()
new_list = []
for rec in orders:
    order_id = rec.get('order_id')
    matching_orderlines = list(filter(lambda ol: ol['order_id'] == order_id, orderlines))
    new_list.append({**rec, 'lines': matching_orderlines})
I am also assuming that the key order_id always exists; if not, replace the brackets with .get(). I didn't test the code above...
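A quick usage sketch (an assumption on my part): since orders and orderlines in your code are JSON strings produced by to_json(orient="records"), parse them back to lists of dicts before running the loop, then serialize the nested result:

import json

orders = json.loads(orders)          # list of order dicts
orderlines = json.loads(orderlines)  # list of order-line dicts

# ...run the loop above to build new_list, then dump the nested structure...
nested_json = json.dumps(new_list, indent=2)
print(nested_json)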

How to filter a Spark dataframe by whether a URL exists or not?

I want to filter my Spark dataframe. In this dataframe, there is a column of URLs.
I have tried to use os.path.exists(col("url")) to filter my dataframe, but I got an error like
"string is needed, but column has been found".
Here is part of my code. Pandas was used in it, and now I want to use Spark to implement the following:
bob_ross = pd.DataFrame.from_csv("/dbfs/mnt/umsi-data-science/si618wn2017/bob_ross.csv")
bob_ross['image'] = ""
# create a column for each of the 85 colors (these will be c0...c84)
# we'll do this in a separate table for now and then merge
cols = ['c%s'%i for i in np.arange(0,85)]
colors = pd.DataFrame(columns=cols)
colors['EPISODE'] = bob_ross.index.values
colors = colors.set_index('EPISODE')
# figure out if we have the image or not, we don't have a complete set
for s in bob_ross.index.values:
    b = bob_ross.loc[s]['TITLE']
    b = b.lower()
    b = re.sub(r'[^a-z0-9\s]', '', b)
    b = re.sub(r'\s', '_', b)
    img = b + ".png"
    if os.path.exists("/dbfs/mnt/umsi-data-science/si618wn2017/images/"+img):
        bob_ross.set_value(s, "image", "/dbfs/mnt/umsi-data-science/si618wn2017/images/"+img)
        t = getColors("/dbfs/mnt/umsi-data-science/si618wn2017/images/"+img)
        colors.loc[s] = t
bob_ross = bob_ross.join(colors)
bob_ross = bob_ross[bob_ross.image != ""]
Here is how I try to implement it with Spark; I am stuck at the error line:
from pyspark.sql.functions import *
bob_ross = spark.read.csv('/mnt/umsi-data-science/si618wn2017/bob_ross.csv',header=True)
bob_ross=bob_ross.withColumn("image",concat(lit("/dbfs/mnt/umsi-data-science/si618wn2017/images/"),concat(regexp_replace(regexp_replace(lower(col('TITLE')),r'[^a-z0-9\s]',''),r'\s','_'),lit(".png"))))
#error line ---filter----
bob_ross.filter(os.path.exists(col("image")))
print(bob_ross.head())
You should be using the filter function, not an OS function.
For example:
df.filter("image is not NULL")
os.path.exists only operates on the local filesystem, while Spark is meant to run on many servers, so that should be a sign that you're not using the correct function.
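If the per-row existence check really is required, one alternative (my sketch, not part of the original answer) is to wrap the check in a UDF so it runs on the executors, assuming the image path (e.g. a /dbfs mount) is visible from every executor:

# Sketch only: push the filesystem check into a UDF so it executes on the
# workers rather than the driver. Assumes the path is mounted on all executors.
import os
from pyspark.sql.functions import col, udf
from pyspark.sql.types import BooleanType

path_exists = udf(lambda p: p is not None and os.path.exists(p), BooleanType())

bob_ross = bob_ross.filter(path_exists(col("image")))
print(bob_ross.head())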

Cannot create a dataframe in PySpark and write it to a Hive table

I am trying to create a dataframe in pyspark, then write it as a Hive table, and then read it back, but it is not working...
sqlContext = HiveContext(sc)
hive_context = HiveContext(sc) #Initialize Hive
#load the control table
cntl_dt = [('2016-04-30')]
rdd = sc.parallelize(cntl_dt)
row_cntl_dt = rdd.map(lambda x: Row(load_dt=x[0]))
df_cntl_dt = sqlContext.createDataFrame(row_cntl_dt)
df_cntl_dt.write.mode("overwrite").saveAsTable("schema.cntrl_tbl")
load_dt = hive_context.sql("select load_dt from schema.cntrl_tbl" ).first()['load_dt'];
print (load_dt)
Prints: 2
I expect: 2016-12-31
This is because:
cntl_dt = [('2016-04-30')]
is not valid syntax for a single-element tuple. The parentheses are ignored and the result is the same as:
['2016-04-30']
and
Row(load_dt=x[0])
will give:
Row(load_dt='2')
Use:
cntl_dt = [('2016-04-30', )]
Also, you're mixing different contexts (SQLContext and HiveContext), which is generally a bad idea (and neither should be used in any recent Spark version).
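A quick illustration of the difference (plain Python, nothing Spark-specific):

# Parentheses alone do not make a tuple; the trailing comma does.
print(type(('2016-04-30')))   # <class 'str'>
print(type(('2016-04-30',)))  # <class 'tuple'>

# With the comma, x[0] in Row(load_dt=x[0]) is the whole date string
# instead of just the first character '2'.
cntl_dt = [('2016-04-30',)]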

How to map structured data to schemaRDD in Spark?

I've asked this question differently before, but there are some changes, so I thought I'd ask it again as a new question.
I have structured data of which only part is in JSON format, but I need to map the entire record to a schemaRDD. The data looks like this:
03052015 04:13:20
{"recordType":"NEW","data":{"keycol":"val1","col2":"val2","col3":"val3"}
Each line starts with date followed by time and a json formatted text.
I need to map not only the text in json but also the date and time into the same structure.
I tried it in Python but obviously it doesn't work because Row does not take an RDD (jsonRDD in this case).
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
orderFile = sc.textFile(myfile)
orderLine = orderFile.map(lambda line: line.split(" ", 2))
anotherOrderLine = orderLine.map(lambda p: Row(date=p[0], time=p[1], content=sqlContext.jsonRDD(p[3])))
schemaOrder = sqlContext.inferSchema(anotherOrderLine)
schemaOrder.printSchema()
for x in schemaOrder.collect():
    print x
The goal is to be able to run a query like this against the schemaRDD:
select date, time, data.keycol, data.val1, data.val2, data.val3 from myOrder
How can I map the entire line to a schemaRDD?
Any help is appreciated.
The simplest option would be to add these fields to the JSON and use jsonRDD.
My data:
03052015 04:13:20 {"recordType":"NEW","data":{"keycol":"val1","col1":"val5","col2":"val3"}}
03062015 04:13:20 {"recordType":"NEW1","data":{"keycol":"val2","col1":"val6","col2":"val3"}}
03072015 04:13:20 {"recordType":"NEW2","data":{"keycol":"val3","col1":"val7","col2":"val3"}}
03082015 04:13:20 {"recordType":"NEW3","data":{"keycol":"val4","col1":"val8","col2":"val3"}}
Code:
import json
def transform(data):
    ts = data[:18].strip()
    jss = data[18:].strip()
    jsj = json.loads(jss)
    jsj['ts'] = ts
    return json.dumps(jsj)
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
rdd = sc.textFile('/sparkdemo/sample.data')
tbl = sqlContext.jsonRDD(rdd.map(transform))
tbl.registerTempTable("myOrder")
sqlContext.sql("select ts, recordType, data.keycol, data.col1, data.col2 data from myOrder").collect()
Result:
[Row(ts=u'03052015 04:13:20', recordType=u'NEW', keycol=u'val1', col1=u'val5', data=u'val3'), Row(ts=u'03062015 04:13:20', recordType=u'NEW1', keycol=u'val2', col1=u'val6', data=u'val3'), Row(ts=u'03072015 04:13:20', recordType=u'NEW2', keycol=u'val3', col1=u'val7', data=u'val3'), Row(ts=u'03082015 04:13:20', recordType=u'NEW3', keycol=u'val4', col1=u'val8', data=u'val3')]
The problem in your code is that you are calling jsonRDD for each of the rows; this is not correct - it accepts an RDD and returns a SchemaRDD.
sqlContext.jsonRDD creates a SchemaRDD from an RDD of strings, where each string contains a JSON document. This code sample is from the Spark SQL documentation (https://spark.apache.org/docs/1.2.0/sql-programming-guide.html):
val anotherPeopleRDD = sc.parallelize("""{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
val anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
One of the cool things about jsonRDD is that you can provide an additional parameter stating the JSON's schema, which should improve performance. This can be done by creating a SchemaRDD from a sample document and then calling its schema method to get the schema.
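A sketch of that in Python, assuming the Spark 1.2 API (where jsonRDD accepts an optional schema argument and the SchemaRDD exposes a schema() method, as described above):

# Infer the schema once from a single sample record, then reuse it when
# loading the full data set to skip the schema-inference pass.
json_rdd = rdd.map(transform)
sample = sqlContext.jsonRDD(sc.parallelize([json_rdd.first()]))
schema = sample.schema()

tbl = sqlContext.jsonRDD(json_rdd, schema)
tbl.registerTempTable("myOrder")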

Can't perform 2 successive groupBy operations in Spark

I am working with Spark in Python.
My problem is: I have a .csv file which contains some data (int1, int2, int3, date). I did a groupByKey on int1. Now I want to perform another groupBy on my date with the RDD created by the first groupBy.
The problem is that I can't perform it. Any idea?
Regards
EDIT2:
from pyspark import SparkContext
import csv
import sys
import StringIO
sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")
csvById12Rdd=file.map(lambda (id1,id2,value): ((id1,id2),value)).groupByKey()
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()
def printit(one):
    id1, twos = one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values = two
        print("Id1:{} Id2:{}".format(id1, id2))
        for value in values:
            print("Id1:{} Id2:{} Value:{}".format(id1, id2, value))
csvById12Rdd.first().foreach(printit)
The csv looks like:
31705,48,2,2014-10-28T18:14:09.000Z
EDIT 3:
I can print my iterator data with this code:
from pyspark import SparkContext
import csv
import sys
import StringIO
sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")
def go_in_rdd2(x):
    print x[0]
    for i in x[1]:
        print i
counts = file.map(lambda line: (line.split(",")[0],line.split(",")[1:]))
counts = counts.groupByKey()
counts.foreach(go_in_rdd2)
But I still can't groupBy.
groupBy returns an RDD of (Key, Iterable[Value]); can you do it the other way round?
Group by id1 and id2 and get an RDD of ((Id1, Id2), Iterable[Value]).
Then group by id1 alone and get an RDD of (Id1, Iterable[(Id2, Iterable[Value])]).
Something like:
csv=[(1,1,"One","Un"),(1,2,"Two","Deux"),(2,1,"Three","Trois"),(2,1,"Four","Quatre")]
csvRdd=sc.parallelize(csv)
# Step 1
csvById12Rdd=csvRdd.map(lambda (id1,id2,value1,value2): ((id1,id2),(value1,value2))).groupByKey()
# Step 2
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()
# Print
def printit(one):
    id1, twos = one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values = two
        print("Id1:{} Id2:{}".format(id1, id2))
        for value1, value2 in values:
            print("Id1:{} Id2:{} Values:{} {}".format(id1, id2, value1, value2))
csvById1Rdd.foreach(printit)
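Note that the lambdas above rely on tuple parameter unpacking, which only works in Python 2. A Python 3-compatible sketch of the same two grouping steps (same logic, just indexing into the tuples instead):

# Python 3: tuple unpacking in lambda parameters was removed (PEP 3113),
# so index into the tuples instead.
csv = [(1, 1, "One", "Un"), (1, 2, "Two", "Deux"), (2, 1, "Three", "Trois"), (2, 1, "Four", "Quatre")]
csvRdd = sc.parallelize(csv)

# Step 1: key by (id1, id2)
csvById12Rdd = csvRdd.map(lambda r: ((r[0], r[1]), (r[2], r[3]))).groupByKey()

# Step 2: re-key by id1 alone, keeping (id2, grouped values) as the value
csvById1Rdd = csvById12Rdd.map(lambda kv: (kv[0][0], (kv[0][1], kv[1]))).groupByKey()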
