PySpark Error when Using Jellyfish Functions: str argument expected - python

I am working on a task that computes similarity scores for name-related data, using Spark and the jellyfish library in Python. Below is my code (the functions live inside a class):
import jellyfish
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark import SparkContext

df = self.jaro_winkler_func(df, 'df1.first_name', 'df2.first_name')

def jaro_winkler_score(self, s1, s2):
    if s1 is None or s2 is None:
        out = 0
    else:
        out = jellyfish.jaro_winkler(s1, s2)
    return out

def jaro_winkler_func(self, df, column_left, column_right):
    df = df.withColumn('test', self.jaro_winkler_score(df[column_left], df[column_right]))
    return df
Below is the error I got:
out = jellyfish.jaro_winkler(s1, s2)
TypeError: str argument expected
I have seen the related posts below for the same issue, but the functions above already follow the answers from those posts:
Creating score column in Pyspark data frame using jellyfish package
Pyspark: How to deal with null values in python user defined functions
I am using Spark 2.3.
Please suggest and thanks in advance.
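For reference, the TypeError occurs because jaro_winkler_score receives pyspark Column objects rather than Python strings, so jellyfish never sees actual name values. A minimal sketch of wrapping the scorer in a UDF (reusing the names from the snippet above; the DoubleType return type is an assumption) would be:

import jellyfish
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def jaro_winkler_score(s1, s2):
    # Guard against nulls, then compare the two raw strings
    if s1 is None or s2 is None:
        return 0.0
    return float(jellyfish.jaro_winkler(s1, s2))

# Register the plain Python function as a UDF so Spark passes string values, not Columns
jaro_winkler_udf = udf(jaro_winkler_score, DoubleType())

def jaro_winkler_func(df, column_left, column_right):
    return df.withColumn('test', jaro_winkler_udf(df[column_left], df[column_right]))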

Related

Unable to create dataframe from RDD

I am trying to create a recommender system from this Kaggle dataset:
https://www.kaggle.com/kerneler/starter-user-artist-playcount-dataset-f7a1f242-c
the file is called: "user_artist_data_small.txt"
The data looks like this:
1059637 1000010 238
1059637 1000049 1
1059637 1000056 1
1059637 1000062 11
1059637 1000094 1
I'm getting an error on the third-to-last line of the code below.
!pip install pyspark==3.0.1 py4j==0.10.9
from pyspark.sql import SparkSession
from pyspark import SparkContext
appName="Collaborative Filtering with PySpark"
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,LongType
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from google.colab import drive
drive.mount ('/content/gdrive')
spark = SparkSession.builder.appName(appName).getOrCreate()
sc = spark.sparkContext
userArtistData1=sc.textFile("/content/gdrive/My Drive/data/user_artist_data_small.txt")
schema_user_artist = StructType([StructField("userId",StringType(),True),StructField("artistId",StringType(),True),StructField("playCount",StringType(),True)])
userArtistRDD = userArtistData1.map(lambda k: k.split())
user_artist_df = spark.createDataFrame(userArtistRDD,schema_user_artist,['userId','artistId','playCount'])
ua = user_artist_df.alias('ua')
(training, test) = ua.randomSplit([0.8, 0.2]) #Training the model
als = ALS(maxIter=5, implicitPrefs=True,userCol="userId", itemCol="artistId", ratingCol="playCount",coldStartStrategy="drop")
model = als.fit(training)# predict using the testing datatset
predictions = model.transform(test)
predictions.show()
The error is:
IllegalArgumentException: requirement failed: Column userId must be of type numeric but was actually of type string.
So I changed the type from StringType to IntegerType in the schema, and then I got this error:
TypeError: field userId: IntegerType can not accept object '1059637' in type <class 'str'>
The number happens to be the first item in the dataset. Please help?
Just create a dataframe using the CSV reader (with a space delimiter) instead of creating an RDD:
user_artist_df = spark.read.schema(schema_user_artist).csv('/content/gdrive/My Drive/data/user_artist_data_small.txt', sep=' ')
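Note that ALS still needs numeric columns (that is what the first error says), so the schema presumably has to use IntegerType once the CSV reader is doing the parsing. A hedged variant of the line above:

from pyspark.sql.types import StructType, StructField, IntegerType

# Assumption: let the CSV reader parse the space-separated integers directly
schema_user_artist = StructType([
    StructField("userId", IntegerType(), True),
    StructField("artistId", IntegerType(), True),
    StructField("playCount", IntegerType(), True)])

user_artist_df = spark.read.schema(schema_user_artist) \
    .csv('/content/gdrive/My Drive/data/user_artist_data_small.txt', sep=' ')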

Dot product in pyspark dataframes with MLLIB

I have a very simple dataframe in pyspark, something like this:
from pyspark.sql import Row
from pyspark.mllib.linalg import DenseVector

offer_row = Row("a", "b")
df_offers = spark.sparkContext.parallelize([
    offer_row(DenseVector([1, 1, 1]), DenseVector([1, 0, 0])),
]).toDF()
and I would like to compute the dot product of these vectors without resorting to a UDF call.
The Spark MLlib documentation references a dot method on DenseVector, but if I try to apply it as follows:
df_offers = df_offers.withColumn("c", col("a").dot(col("b")))
I get errors like:
TypeError: 'Column' object is not callable
Does anyone know if these MLlib methods can be called on DataFrame objects?
Here you're applying the dot method on a Column and not on a DenseVector, which indeed does not work:
df_offers = df_offers.withColumn("c", col("a").dot(col("b")))
You will have to use a UDF:
from pyspark.sql.functions import udf, array
from pyspark.sql.types import DoubleType

def dot_fun(array):
    return array[0].dot(array[1])

dot_udf = udf(dot_fun, DoubleType())

df_offers = df_offers.withColumn("c", dot_udf(array('a', 'b')))
There are not. You'll have to use a UDF:
from pyspark.sql.functions import udf

@udf("double")
def dot(x, y):
    if x is not None and y is not None:
        return float(x.dot(y))
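Applied to the frame from the question, the decorated UDF would presumably be used like this:

df_offers = df_offers.withColumn("c", dot("a", "b"))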
You can multiply two columns without using a UDF by first converting them into BlockMatrix objects and multiplying them, like the example below:
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

ac = df_offers.select('a')
bc = df_offers.select('b')
mata = IndexedRowMatrix(ac.rdd.map(lambda row: IndexedRow(*row)))
matb = IndexedRowMatrix(bc.rdd.map(lambda row: IndexedRow(*row)))
ma = mata.toBlockMatrix(100, 100)
mb = matb.toBlockMatrix(100, 100)
ans = ma.multiply(mb.transpose())
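If the product is small enough, it can presumably be brought back to the driver for inspection:

# Assumption: the result fits in driver memory
local_result = ans.toLocalMatrix()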
This is a hack, but it might be more performant than a Python UDF. You could just convert the dot product into SQL:
import pandas as pd
from pyspark.sql.functions import expr

coefs = pd.Series({'a': 1.0, 'b': 2.0})

dot_sql = ' + '.join(
    '{} * {}'.format(coef, colname)
    for colname, coef in coefs.items()
)
dot_expr = expr(dot_sql)

df.withColumn('dot_product', dot_expr)
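For the coefficients above, dot_sql would presumably come out as the string

1.0 * a + 2.0 * b

so the dot product is evaluated as plain SQL arithmetic over scalar columns rather than over vector columns.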

pyspark RDD to DataFrame

I am new to Spark.
I have a DataFrame and I used the following command to group it by 'userid':
def test_groupby(df):
    return list(df)

high_volumn = self.df.filter(self.df.outmoney >= 1000).rdd.groupBy(
    lambda row: row.userid).mapValues(test_groupby)
It gives an RDD with the following structure:
(326033430, [Row(userid=326033430, poiid=u'114233866', _mt_datetime=u'2017-06-01 14:54:48', outmoney=1127.0, partner=2, paytype=u'157', locationcity=u'\u6f4d\u574a', locationprovince=u'\u5c71\u4e1c\u7701', location=None, dt=u'20170601')])
326033430 is the key of the large group.
My question is: how can I convert this RDD back to a DataFrame structure? If I cannot do that, how can I get values out of the Row objects?
Thank you.
You should just
from pyspark.sql.functions import *

high_volumn = self.df\
    .filter(self.df.outmoney >= 1000)\
    .groupBy('userid').agg(collect_list('col'))
and in the .agg method pass whatever you want to do with the rest of the data.
Follow this link: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.agg
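If the goal is to keep whole rows per userid rather than a single column, a hedged variant (column names taken from the Row printed in the question) collects structs instead:

from pyspark.sql.functions import collect_list, struct

high_volumn = self.df \
    .filter(self.df.outmoney >= 1000) \
    .groupBy('userid') \
    .agg(collect_list(struct('poiid', 'outmoney', 'paytype')).alias('records'))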

How do I search for a tuple of values in pandas?

I'm trying to write a function to swap a dictionary of targets with results in a pandas dataframe. I'd like to match a tuple of values and swap in new values. I tried building it as follows, but the row selection isn't working. I feel like I'm missing some critical function here.
import pandas

testData=pandas.DataFrame([["Cats","Parrots","Sandstone"],["Dogs","Cockatiels","Marble"]],columns=["Mammals","Birds","Rocks"])
target=("Mammals","Birds")
swapVals={("Cats","Parrots"):("Rats","Canaries")}

for x in swapVals:
    #Attempt 1:
    #testData.loc[x,target]=swapVals[x]
    #Attempt 2:
    testData[testData.loc[:,target]==x,target]=swapVals[x]
This was written in Python 2, but the basic idea should work for you. It uses the apply function:
import pandas

testData=pandas.DataFrame([["Cats","Parrots","Sandstone"],["Dogs","Cockatiels","Marble"]],columns=["Mammals","Birds","Rocks"])
swapVals={("Cats","Parrots"):("Rats","Canaries")}
target=["Mammals","Birds"]

def swapper(in_row):
    temp = tuple(in_row.values)
    if temp in swapVals:
        return list(swapVals[temp])
    else:
        return in_row

testData[target] = testData[target].apply(swapper, axis=1)
testData
Note that if you loaded the other keys into the dict, you could do the apply without the swapper function:
import pandas
testData=pandas.DataFrame([["Cats","Parrots","Sandstone"],["Dogs","Cockatiels","Marble"]],columns=["Mammals","Birds","Rocks"])
swapVals={("Cats","Parrots"):("Rats","Canaries"), ("Dogs","Cockatiels"):("Dogs","Cockatiels")}
target=["Mammals","Birds"]
testData[target] = testData[target].apply(lambda x: list(swapVals[tuple(x.values)]), axis=1)
testData
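Either way, the first row presumably ends up as (Rats, Canaries, Sandstone) while the second stays (Dogs, Cockatiels, Marble), since its key maps to itself.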

Can't perform 2 successive groupBy in Spark

I am working with Spark in Python.
My problem is: I have a .csv file which contains some data (int1, int2, int3, date). I did a groupByKey on int1. Now I want to perform another groupBy on the date, using the RDD created by the first groupBy.
The problem is that I can't perform it. Any ideas?
Regards
EDIT 2:
from pyspark import SparkContext
import csv
import sys
import StringIO

sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")

csvById12Rdd=file.map(lambda (id1,id2,value): ((id1,id2),value)).groupByKey()
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()

def printit(one):
    id1, twos=one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values=two
        print("Id1:{} Id2:{}".format(id1,id2))
        for value in values:
            print("Id1:{} Id2:{} Value:{}".format(id1,id2,value))

csvById12Rdd.first().foreach(printit)
The CSV looks like:
31705,48,2,2014-10-28T18:14:09.000Z
EDIT 3:
I can print my iterator data with this code:
from pyspark import SparkContext
import csv
import sys
import StringIO

sc = SparkContext("local", "Simple App")
file = sc.textFile("histories_2week9.csv")

def go_in_rdd2(x):
    print x[0]
    for i in x[1]:
        print i

counts = file.map(lambda line: (line.split(",")[0],line.split(",")[1:]))
counts = counts.groupByKey()
counts.foreach(go_in_rdd2)
but I still can't do the second groupBy.
groupBy returns an RDD of (Key, Iterable[Value]). Can you do it the other way round?
Group by id1 and id2 and get an RDD of ((Id1,Id2), Iterable[Value])
Then group by id1 alone and get an RDD of (Id1, Iterable[(Id2,Iterable[Value])])
Something like:
csv=[(1,1,"One","Un"),(1,2,"Two","Deux"),(2,1,"Three","Trois"),(2,1,"Four","Quatre")]
csvRdd=sc.parallelize(csv)

# Step 1
csvById12Rdd=csvRdd.map(lambda (id1,id2,value1,value2): ((id1,id2),(value1,value2))).groupByKey()

# Step 2
csvById1Rdd=csvById12Rdd.map(lambda ((id1,id2),group):(id1, (id2,group))).groupByKey()

# Print
def printit(one):
    id1, twos=one
    print("Id1:{}".format(id1))
    for two in twos:
        id2, values=two
        print("Id1:{} Id2:{}".format(id1,id2))
        for value1,value2 in values:
            print("Id1:{} Id2:{} Values:{} {}".format(id1,id2,value1,value2))

csvById1Rdd.foreach(printit)
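One caveat: the tuple-unpacking lambdas above only parse under Python 2 (the feature was removed in Python 3), so a Python 3 sketch of steps 1 and 2 would index the tuples instead:

# Python 3 variant (assumption: same RDD contents as above)
csvById12Rdd = csvRdd.map(lambda t: ((t[0], t[1]), (t[2], t[3]))).groupByKey()
csvById1Rdd = csvById12Rdd.map(lambda kv: (kv[0][0], (kv[0][1], kv[1]))).groupByKey()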
