PySpark/HIVE: append to an existing table - python

Really basic pyspark/hive question:
How do I append to an existing table? My attempt is below
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
conf_init = SparkConf().setAppName('pyspark2')
sc = SparkContext(conf = conf_init)
hive_cxt = HiveContext(sc)
import pandas as pd
df = pd.DataFrame({'a':[0,0], 'b':[0,0]})
sdf = hive_cxt.createDataFrame(df)
sdf.write.mode('overwrite').saveAsTable('database.table') #this line works
df = pd.DataFrame({'a':[1,1,1], 'b':[2,2,2]})
sdf = hive_cxt.createDataFrame(df)
sdf.write.mode('append').saveAsTable('database.table') #this line does not work
#sdf.write.insertInto('database.table',overwrite = False) #this line does not work
Thanks!
Sam

It seems using mode('overwrite') was causing the problem: it drops the table and then recreates a new one. If I do the following, everything works fine:
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
conf_init = SparkConf().setAppName('pyspark2')
sc = SparkContext(conf = conf_init)
print(sc.version)
hive_cxt = HiveContext(sc)
hive_cxt.sql('USE database')
query = """
CREATE TABLE IF NOT EXISTS table (a int, b int)
STORED AS parquet
"""
hive_cxt.sql(query)
import pandas as pd
df = pd.DataFrame({'a':[0,0], 'b':[0,0]})
sdf = hive_cxt.createDataFrame(df)
sdf.write.mode('append').format('hive').saveAsTable('table')
query = """
SELECT *
FROM table
"""
df = hive_cxt.sql(query)
df = df.toPandas()
print(df) # successfully pulls the data from the table
df = pd.DataFrame({'a':[1,1,1], 'b':[2,2,2]})
sdf = hive_cxt.createDataFrame(df)
sdf.write.mode('append').format('hive').saveAsTable('table')

I think you previously forgot to use the format option, which caused the issue when you tried to append rather than overwrite, as you mentioned above.
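For completeness, insertInto (the question's other attempt) should also be able to append once the Hive table exists; a minimal sketch, assuming the table created by the CREATE TABLE statement above and keeping in mind that insertInto matches columns by position, not by name:
# Sketch only: append the same sdf by position into the existing Hive table.
# The DataFrame column order must match the table definition (a, b).
sdf.write.insertInto('table', overwrite=False)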

Related

How do I compile and bring in multiple outputs from the same worker?

I'm developing a Kubeflow pipeline that takes in a dataset, splits it into two datasets based on a filter inside the code, and outputs both. The function looks like the following:
def merge_promo_sales(input_data: Input[Dataset],
                      output_data_hd: OutputPath("Dataset"),
                      output_data_shop: OutputPath("Dataset")):
    import pandas as pd
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_columns', 500)
    import numpy as np
    from google.cloud import bigquery
    from utils import google_bucket

    client = bigquery.Client("gcp-sc-demand-plan-analytics")
    print("Client creating using default project: {}".format(client.project), "Pulling Data")
    query = """
    SELECT * FROM `gcp-sc-demand-plan-analytics.Modeling_Input.monthly_delivery_type_sales` a
    Left Join `gcp-sc-demand-plan-analytics.Modeling_Input.monthly_promotion` b
    on a.ship_base7 = b.item_no
    and a.oper_cntry_id = b.corp_cd
    and a.dmand_mo_yr = b.dates
    """
    query_job = client.query(
        query,
        # Location must match that of the dataset(s) referenced in the query.
        location="US",
    )  # API request - starts the query
    df = query_job.to_dataframe()
    df.drop(['corp_cd', 'item_no', 'dates'], axis=1, inplace=True)
    df.loc[:, 'promo_objective_increase_margin':] = df.loc[:, 'promo_objective_increase_margin':].fillna(0)
    items = df['ship_base7'].unique()
    df = df[df['ship_base7'].isin(items)]
    df_hd = df[df['location_type'] == 'home_delivery']
    df_shop = df[df['location_type'] != 'home_delivery']
    df_hd.to_pickle(output_data_hd)
    df_shop.to_pickle(output_data_shop)
That part works fine. When I try to feed those two data sets into the next function with the compiler, I hit errors.
I tried the following:
@kfp.v2.dsl.pipeline(name=PIPELINE_NAME)
def my_pipeline():
    merge_promo_sales_nl = merge_promo_sales(input_data=new_launch.output)
    rule_3_hd = rule_3(input_data=merge_promo_sales_nl.output_data_hd)
    rule_3_shop = rule_3(input_data=merge_promo_sales_nl.output_data_shop)
The error I get is the following:
AttributeError: 'ContainerOp' object has no attribute 'output_data_hd'
output_data_hd is the parameter I write that dataset out to, but apparently that's not the attribute name Kubeflow is looking for.
I just figured this out.
When a component has multiple outputs, you reference them through the .outputs dictionary in the pipeline definition:
rule_3_hd = rule_3(input_data = merge_promo_sales_nl.outputs['output_data_hd'])
rule_3_shop = rule_3(input_data = merge_promo_sales_nl.outputs['output_data_shop'])
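Putting it together, a minimal sketch of the corrected pipeline definition (new_launch, rule_3, and PIPELINE_NAME are assumed to be defined elsewhere in the project, as in the question):
@kfp.v2.dsl.pipeline(name=PIPELINE_NAME)
def my_pipeline():
    merge_promo_sales_nl = merge_promo_sales(input_data=new_launch.output)
    # Multi-output components expose each output through the .outputs dict,
    # keyed by the parameter name declared in the component signature.
    rule_3_hd = rule_3(input_data=merge_promo_sales_nl.outputs['output_data_hd'])
    rule_3_shop = rule_3(input_data=merge_promo_sales_nl.outputs['output_data_shop'])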

How to sort S3 CSV File using AWS GLUE

I'm relatively new to AWS Glue and Spark. I'd like to sort a CSV file in S3 by user ID. I'm trying out the script below, but it's not sorting the file. Can someone please help me with this?
import sys
import math
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import current_date
import pyspark.sql.functions as f
from pyspark.sql.functions import asc
args = getResolvedOptions(sys.argv, ['JOB_NAME','DESTINATION_PATH', 'SOURCE_PATH'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
dyf = glueContext.create_dynamic_frame.from_options("s3", connection_options = {"paths": [args['SOURCE_PATH']]}, format="csv", format_options = {"withHeader": True});
print("records read from s3 store")
print(dyf.count())
file_size = 10000
n_partitions = int(math.ceil(dyf.count() / float(file_size)))
print("splitting file into partitions")
print(n_partitions)
sort_dataframe = dyf.toDF().orderBy("user_id")
print(sort_dataframe.show())
df_dataframe = sort_dataframe.repartition(n_partitions)
ddf_dataframe = DynamicFrame.fromDF(df_dataframe, glueContext, "ddf_dataframe")
datasink4 = glueContext.write_dynamic_frame.from_options(frame = ddf_dataframe, connection_type = "s3", format = "csv", connection_options = {"path": args['DESTINATION_PATH']}, transformation_ctx = "datasink4",format_options = {"withHeader": True})
print("records processing complete")
job.commit()
You are sorting it, then immediately shuffling everything randomly to other partitions by repartitioning. Do dyf.toDF().repartition(n_partitions).sortWithinPartitions("user_id") instead. Each file will contain user ids from across the full range, but within each file every row is sorted by user id.
If you are querying with Athena, that is actually good: it can query all the files in parallel, and each query can zoom in very quickly on just the portion of a file holding the user ids you filter by (at least if you are using Parquet).
If that is not suitable, try dyf.toDF().repartitionByRange(n_partitions, "user_id"). That requires Spark to sample user_id and make an educated guess at how to distribute the user ids between files, so the files may not be perfectly evenly sized, but each file will hold a distinct set of user ids and no two files will have overlapping ranges.
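A minimal sketch of how these suggestions might slot into the original job, combining repartitionByRange with sortWithinPartitions so each output file holds its own sorted, non-overlapping range of user ids (variable names reused from the question; untested against a live Glue job):
# Sketch only: range-partition by user_id, sort inside each partition, then write back.
sorted_df = dyf.toDF().repartitionByRange(n_partitions, "user_id").sortWithinPartitions("user_id")
ddf_sorted = DynamicFrame.fromDF(sorted_df, glueContext, "ddf_sorted")
glueContext.write_dynamic_frame.from_options(
    frame=ddf_sorted,
    connection_type="s3",
    format="csv",
    connection_options={"path": args['DESTINATION_PATH']},
    format_options={"withHeader": True},
)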

Converting rdd to dataframe: AttributeError: 'RDD' object has no attribute 'toDF' using PySpark

I am trying to convert the RDD to DataFrame using PySpark. Below is my code.
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
conf = SparkConf().setMaster("local").setAppName("Dataframe_examples")
sc = SparkContext(conf=conf)
def parsedLine(line):
    fields = line.split(',')
    movieId = fields[0]
    movieName = fields[1]
    genres = fields[2]
    return movieId, movieName, genres
movies = sc.textFile("file:///home/ajit/ml-25m/movies.csv")
parsedLines = movies.map(parsedLine)
print(parsedLines.count())
dataFrame = parsedLines.toDF(["movieId"])
dataFrame.printSchema()
I am running this code using PyCharm IDE.
And I get the error:
File "/home/ajit/PycharmProjects/pythonProject/Dataframe_examples.py", line 19, in <module>
dataFrame = parsedLines.toDF(["movieId"])
AttributeError: 'PipelinedRDD' object has no attribute 'toDF'
As I am new to this, let me know what I am missing.
toDF is only attached to RDDs once a SparkSession (or SQLContext) exists, so initialize a SparkSession by passing it your SparkContext.
Example:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
conf = SparkConf().setMaster("local").setAppName("Dataframe_examples")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
def parsedLine(line):
    fields = line.split(',')
    movieId = fields[0]
    movieName = fields[1]
    genres = fields[2]
    return movieId, movieName, genres
movies = sc.textFile("file:///home/ajit/ml-25m/movies.csv")
#or using spark.sparkContext
movies = spark.sparkContext.textFile("file:///home/ajit/ml-25m/movies.csv")
parsedLines = movies.map(parsedLine)
print(parsedLines.count())
dataFrame = parsedLines.toDF(["movieId"])
dataFrame.printSchema()
Alternatively, use SparkSession to turn the RDD into a DataFrame as follows:
movies = sc.textFile("file:///home/ajit/ml-25m/movies.csv")
parsedLines = movies.map(parsedLine)
print(parsedLines.count())
spark = SparkSession.builder.getOrCreate()
dataFrame = spark.createDataFrame(parsedLines).toDF("movieId", "movieName", "genres")
dataFrame.printSchema()
Or create the SparkSession first and take the SparkContext from it:
spark = SparkSession.builder.master("local").appName("Dataframe_examples").getOrCreate()
sc = spark.sparkContext
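Putting the two fragments together, a sketch of the full flow driven by a builder-created SparkSession (column names taken from parsedLine; a sketch, not the answerer's verbatim code):
# Sketch: the session created below attaches toDF to RDDs, so the conversion works.
spark = SparkSession.builder.master("local").appName("Dataframe_examples").getOrCreate()
sc = spark.sparkContext
movies = sc.textFile("file:///home/ajit/ml-25m/movies.csv")
parsedLines = movies.map(parsedLine)
dataFrame = parsedLines.toDF(["movieId", "movieName", "genres"])
dataFrame.printSchema()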

pyspark context issue: AttributeError: type object 'SparkContext' has no attribute '_jsc'?

import os
import pandas as pd
from pyspark.sql import SQLContext
from pyspark import SparkContext as sc
import pyarrow
os.chdir(r'C:\2020\EYAIA\work\slack')
master_path = r'PRP.xlsx'
other_path = r'Book1.xlsx'
output = 'merged.xlsx'
sqlContext = SQLContext(sc)
master = pd.read_excel(master_path)
master.to_csv('csvfile.csv', encoding='utf-8', index=False)
df = sqlContext.read.load('csvfile.csv', format='com.databricks.spark.csv', header='true', inferSchema='true')
print(df)
Getting
AttributeError: type object 'SparkContext' has no attribute '_jsc'
What could be the issue?
I am running this from PyCharm.
The error comes from importing the SparkContext class itself as sc and handing that class (not a running context) to SQLContext, which is why _jsc is missing. Create a SparkSession object instead and then read the data as:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('abc').getOrCreate()
df = spark.read.csv('path/to/csv',header=True, inferSchema=True)
print(df.show())
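Since the source data is Excel, here is a hedged sketch of skipping the intermediate CSV entirely by handing the pandas DataFrame straight to Spark (master_path reused from the question):
# Sketch only: read the workbook with pandas, then convert directly to a Spark DataFrame.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('abc').getOrCreate()
master = pd.read_excel(master_path)      # master_path as defined in the question
sdf = spark.createDataFrame(master)      # schema inferred from the pandas dtypes
sdf.show()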

How to use unbase64 function in pyspark SQL query?

I cannot seem to figure out why the unbase64 function won't work in my Spark SQL query.
Here is an example. I'm trying to decode "VGhpcyBpcyBhIHRlc3Qh" by calling unbase64 within Spark SQL. Any thoughts on why the output doesn't get decoded? Thanks.
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import unbase64
sc = SparkContext("local", "Simple App")
sqlContext = SQLContext(sc)
log = [{"eventTime":"2015-12-14 15:27:00","id":"9ab0135f-b8a3-4312-9065-9f8874fd790c","fullLog":"VGhpcyBpcyBhIHRlc3Qh"}]
df = sqlContext.createDataFrame(log)
df.registerTempTable('data')
query = sqlContext.sql('SELECT unbase64(fullLog) as test FROM data')
query.write.save("output", format="json")
The output is : {"test":"VGhpcyBpcyBhIHRlc3Qh"} when I want it to be: {"test":"This is a test!"}
It seems to work for me. Note that unbase64 returns a binary column, so the result has to be cast to a string to see the decoded text:
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext
log = [("2015-12-14 15:27:00","9ab0135f-b8a3-4312-9065-9f8874fd790c","VGhpcyBpcyBhIHRlc3Qh")]
rdd_log = sc.parallelize(log)
df = sqlContext.createDataFrame(rdd_log, ["eventTime", "id", "fullLog"])
df.registerTempTable("data")
query = sqlContext.sql('SELECT unbase64(fullLog) as test FROM data')
query = query.select(query.test.cast("string").alias('test'))
print(query.collect())
# [Row(test=u'This is a test!')]
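The cast can also be pushed into the SQL itself; a small sketch reusing the data table registered in the question:
# Sketch: cast the binary result of unbase64 to a string inside the query.
query = sqlContext.sql("SELECT CAST(unbase64(fullLog) AS STRING) AS test FROM data")
query.write.save("output", format="json")
# expected: {"test":"This is a test!"}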
