Check if Input dataset contains a key or not in PySpark - python

I have the following code as shown below. I need to check if the column y.lc.eoouh.ci is present in the input source and populate the column only if present, else it should be NULL.(The key lc is also optional)
The code below doesn't seem to work the way it is supposed to as even though y.lc.eoouch.ci is present in the input, it evaluates to NULL.
The has_column implementation is from here.
df = df_s_a \
.withColumn("ceci", \
udf(
lambda y : y.lc[-1].eoouh.ci \
if has_column(y, 'lc.eoouh.ci') \
else None, \
StringType()
)(col('eh') \
) \
) \
.select(
col('ceci')
)
df.show()
Sample input:
{
eh: {
lc: [
eoouch: {
ci: "1234ABC"
}
]
}
}

The df[something.path.somewhere] doesn't work. I'll have to investigate that option a bit.
I've managed to make it work like this:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
def has_column(df):
try:
df["lc"][0]["eoouch"]["ci"]
return True
except KeyError:
return False
if __name__ == "__main__":
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
data = [
{"eh": {"lc": [{"eoouch": {"ci": "test"}}]}},
{"eh": {"lc": [{"eoouch": {"as": "test"}}]}},
]
df = spark.createDataFrame(data)
add_column_udf = F.udf(
lambda y: y if has_column(y) else None,
StringType(),
)
df = df.withColumn("ceci", add_column_udf(F.col("eh")))
Result:
+----------------------------------+-------------------------+
|eh |ceci |
+----------------------------------+-------------------------+
|{lc -> [{eoouch -> {ci -> test}}]}|{lc=[{eoouch={ci=test}}]}|
|{lc -> [{eoouch -> {as -> test}}]}|null |
+----------------------------------+-------------------------+
It's not perfect since it's not a general solution for column name but it could be easily generalized since it works on a dict object.

Related

pyspark: heavy initialization with mapPartition

I want to partition a dataframe and iterate over each partition with some initialization in each iteration
data = [("2022-12-22",'d1',2,{'u1':{'sn':['s1','s2'],'fs':200}}),
("2022-12-22",'d2',1,{'u2':{'sn':['s2'],'fs':150},
'u3':{'sn':['s1'],'fs':50}}),
("2022-12-23",'d1',20,{'u1':{'sn':['s1','s2'],'fs':2000}}),
("2022-12-23",'d2',1,{'u2':{'sn':['s2'],'fs':1500},
'u3':{'sn':['s1'],'fs':500}})
]
usernode_schema = StructType([
StructField("sn",ArrayType(StringType())),
StructField("fs",LongType()),
])
userNodeType_schema = MapType(StringType(),usernode_schema)
schema = StructType([
StructField("date",StringType()),
StructField("id",StringType()),
StructField("rank",IntegerType()),
StructField("umap",userNodeType_schema)
])
df = spark.createDataFrame(data=data, schema = schema)
df.show(truncate=False)
def f_fm_row(x,sn_d,i):
sn_d['s1'] = sn_d.get('s1',0)+i
return {"date":x["date"], "id":x["id"],"rank":x["rank"],"s1":sn_d['s1']}
fm_row_schema = StructType([
StructField("date",StringType()),
StructField("id",StringType()),
StructField("rank",IntegerType()),
StructField("s1",IntegerType())
])
def f_map_par(par_df):
for i in range(0,2):
sn_dict ={}
par_df = map(lambda row: f_fm_row(row,sn_dict,i),par_df)
return par_df
#map partitions
(df
.repartition(col("date"))
.sortWithinPartitions(col("rank"))
.rdd
.mapPartitions(f_map_par)
).toDF(fm_row_schema).show()
But sn_dict ={} is not happening in each iteration. It's happens once only. How can I make sure that this initialization happens in each iteration?

Databricks - seprating columns + using schema

I wrote a set of code to create a chart, I am trying to show this chart in 2 ways; with and without the schema function. When I have my code without schema, my problem is that the columns are all shown in one column, and I don't know how to separate them:
# File location and type
file_location = "/databricks-datasets/cs110x/ml-20m/data-001/movies.csv.gz"
file_type = "csv"
# CSV options
infer_schema = "true"
first_row_is_header = "false"
delimiter = "::"
df_movies = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.option("header", first_row_is_header) \
.option("sep", delimiter) \
.load(file_location)
print(df_movies.count())
display(df_movies)
Result:
And when I add schema, columns get separated but I get null values:
from pyspark.sql.types import *
movies_schema = StructType([
StructField('movieId', IntegerType()),
StructField('title', StringType()),
StructField('genres', StringType())
])
# File location and type
file_location = "/databricks-datasets/cs110x/ml-20m/data-001/movies.csv.gz"
file_type = "csv"
# CSV options
infer_schema = "true"
first_row_is_header = "false"
delimiter = "::"
df_movies = spark.read.format(file_type) \
.schema(movies_schema) \
.option("header", first_row_is_header) \
.option("sep", delimiter) \
.load(file_location)
print(df_movies.count())
display(df_movies)
Result:
I appreciate it if you could tell me how I can separate columns in the first section and how I can solve the problem with the null values in the second part.

pyspark.sql.utils.AnalysisException: THEN and ELSE expressions should all be same type or coercible to a common type

I have the following code:
import pyspark.sql.functions as func
def get_alert(bid):
# for simplicity I only provide "return" part
return "1", "2"
get_alert_udf = func.udf(lambda bid:
get_alert(bid),
StructType(
[
StructField('prob', StringType()),
StructField('level', StringType())
]
)
)
df = df \
.withColumn("val", func.when(func.col("is_inside") == 1,
get_alert_udf(
func.col("building_id")
))
.otherwise(func.struct(func.lit("0"),func.lit("0")))
When I execute this code, I get the following error:
pyspark.sql.utils.AnalysisException:
u"cannot resolve
'CASE WHEN (`is_inside` = 1)
THEN <lambda>(building_id) ELSE named_struct('col1', '0', 'col2', '0') END' due to data type mismatch:
THEN and ELSE expressions should all be same type or coercible to a common type
In my case the outputs seem to have the same type in case of THEN and ELSE. I don't understand where is the difference between:
StructType(
[
StructField('prob', StringType()),
StructField('level', StringType())
]
)
and
func.struct(func.lit("0"),func.lit("0"))
The function you use returns a named struct. This means that both names and types have to match:
func.when(
func.col("is_inside") == 1,
get_alert_udf(func.col("building_id"))
).otherwise(
func.struct(func.lit("0").alias("prob"), func.lit("0").alias("level"))
)
or
schema = StructType([
StructField('prob', StringType()), StructField('level', StringType())
])
get_alert_udf = func.udf(get_alert, schema)
and then
func.when(
func.col("is_inside") == 1,
get_alert_udf(func.col("building_id"))
).otherwise(func.struct(func.lit("0"), func.lit("0")).cast(schema))

dataframe object is not callable in pyspark

temp = Window.partitionBy("id").orderBy("time").rowsBetween(-5, 5)
spark_df.withColumn("movingAvg",fn.avgspark_df("average")).over(temp)).show()
I'm getting this error in the last line .
dataframe object is not callable
You are missing a bracket, but it also seems some of the syntax is wrong. I assume this is what your code was before the bracket got missing:
fn.avgspark_df("average")
Which is why you get the error; you are trying to call the DataFrame as a function. I believe you can achieve what you want with:
import pyspark.sql.functions as fn
from pyspark.sql import Window
df = pd.DataFrame({'id': [0,0,0,0,0,1,1,1,1,1],
'time': [1,2,3,4,5,1,2,3,4,5],
'average':[0,1,2,3,4,5,6,7,8,9] })
df = sqlContext.createDataFrame(df)
temp = Window.partitionBy("id").orderBy("time").rowsBetween(-1, 1)
df.withColumn("movingAvg",fn.avg("average").over(temp)).show()
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import max,min,avg
spark = SparkSession.builder.appName("Data Frame Example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()
l=[("Alice", "2016-05-01", 50.00),
("Alice", "2016-05-03", 45.00),
("Alice", "2016-05-04", 55.00),
("Bob", "2016-05-01", 25.00),
("Bob", "2016-05-04", 29.00),
("Bob", "2016-05-06", 27.00)]
customers = spark.sparkContext.parallelize(l).toDF(["name", "date", "amountSpent"])
temp = Window.partitionBy("name").orderBy("date")
customers.withColumn( "movingAvg",avg("amountSpent").over(temp)).show()

What is the best way to remove accents with Apache Spark dataframes in PySpark?

I need to delete accents from characters in Spanish and others languages from different datasets.
I already did a function based in the code provided in this post that removes special the accents. The problem is that the function is slow because it uses an UDF.
I'm just wondering if I can improve the performance of my function to get results in less time, because this is good for small dataframes but not for big ones.
Thanks in advance.
Here the code, you will be able to run it as it is presented:
# Importing sql types
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf, col
import unicodedata
# Building a simple dataframe:
schema = StructType([StructField("city", StringType(), True),
StructField("country", StringType(), True),
StructField("population", IntegerType(), True)])
countries = ['Venezuela', 'US#A', 'Brazil', 'Spain']
cities = ['Maracaibó', 'New York', ' São Paulo ', '~Madrid']
population = [37800000,19795791,12341418,6489162]
# Dataframe:
df = sqlContext.createDataFrame(list(zip(cities, countries, population)), schema=schema)
df.show()
class Test():
def __init__(self, df):
self.df = df
def clearAccents(self, columns):
"""This function deletes accents in strings column dataFrames,
it does not eliminate main characters, but only deletes special tildes.
:param columns String or a list of column names.
"""
# Filters all string columns in dataFrame
validCols = [c for (c, t) in filter(lambda t: t[1] == 'string', self.df.dtypes)]
# If None or [] is provided with column parameter:
if (columns == "*"): columns = validCols[:]
# Receives a string as an argument
def remove_accents(inputStr):
# first, normalize strings:
nfkdStr = unicodedata.normalize('NFKD', inputStr)
# Keep chars that has no other char combined (i.e. accents chars)
withOutAccents = u"".join([c for c in nfkdStr if not unicodedata.combining(c)])
return withOutAccents
function = udf(lambda x: remove_accents(x) if x != None else x, StringType())
exprs = [function(col(c)).alias(c) if (c in columns) and (c in validCols) else c for c in self.df.columns]
self.df = self.df.select(*exprs)
foo = Test(df)
foo.clearAccents(columns="*")
foo.df.show()
One possible improvement is to build a custom Transformer, which will handle Unicode normalization, and corresponding Python wrapper. It should reduce overall overhead of passing data between JVM and Python and doesn't require any modifications in Spark itself or access to private API.
On JVM side you'll need a transformer similar to this one:
package net.zero323.spark.ml.feature
import java.text.Normalizer
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{DataType, StringType}
class UnicodeNormalizer (override val uid: String)
extends UnaryTransformer[String, String, UnicodeNormalizer] {
def this() = this(Identifiable.randomUID("unicode_normalizer"))
private val forms = Map(
"NFC" -> Normalizer.Form.NFC, "NFD" -> Normalizer.Form.NFD,
"NFKC" -> Normalizer.Form.NFKC, "NFKD" -> Normalizer.Form.NFKD
)
val form: Param[String] = new Param(this, "form", "unicode form (one of NFC, NFD, NFKC, NFKD)",
ParamValidators.inArray(forms.keys.toArray))
def setN(value: String): this.type = set(form, value)
def getForm: String = $(form)
setDefault(form -> "NFKD")
override protected def createTransformFunc: String => String = {
val normalizerForm = forms($(form))
(s: String) => Normalizer.normalize(s, normalizerForm)
}
override protected def validateInputType(inputType: DataType): Unit = {
require(inputType == StringType, s"Input type must be string type but got $inputType.")
}
override protected def outputDataType: DataType = StringType
}
Corresponding build definition (adjust Spark and Scala versions to match your Spark deployment):
name := "unicode-normalization"
version := "1.0"
crossScalaVersions := Seq("2.11.12", "2.12.8")
organization := "net.zero323"
val sparkVersion = "2.4.0"
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % sparkVersion,
"org.apache.spark" %% "spark-sql" % sparkVersion,
"org.apache.spark" %% "spark-mllib" % sparkVersion
)
On Python side you'll need a wrapper similar to this one.
from pyspark.ml.param.shared import *
# from pyspark.ml.util import keyword_only # in Spark < 2.0
from pyspark import keyword_only
from pyspark.ml.wrapper import JavaTransformer
class UnicodeNormalizer(JavaTransformer, HasInputCol, HasOutputCol):
#keyword_only
def __init__(self, form="NFKD", inputCol=None, outputCol=None):
super(UnicodeNormalizer, self).__init__()
self._java_obj = self._new_java_obj(
"net.zero323.spark.ml.feature.UnicodeNormalizer", self.uid)
self.form = Param(self, "form",
"unicode form (one of NFC, NFD, NFKC, NFKD)")
# kwargs = self.__init__._input_kwargs # in Spark < 2.0
kwargs = self._input_kwargs
self.setParams(**kwargs)
#keyword_only
def setParams(self, form="NFKD", inputCol=None, outputCol=None):
# kwargs = self.setParams._input_kwargs # in Spark < 2.0
kwargs = self._input_kwargs
return self._set(**kwargs)
def setForm(self, value):
return self._set(form=value)
def getForm(self):
return self.getOrDefault(self.form)
Build Scala package:
sbt +package
include it when you start shell or submit. For example for Spark build with Scala 2.11:
bin/pyspark --jars path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar \
--driver-class-path path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar
and you should be ready to go. All what is left is a little bit of regexp magic:
from pyspark.sql.functions import regexp_replace
normalizer = UnicodeNormalizer(form="NFKD",
inputCol="text", outputCol="text_normalized")
df = sc.parallelize([
(1, "Maracaibó"), (2, "New York"),
(3, " São Paulo "), (4, "~Madrid")
]).toDF(["id", "text"])
(normalizer
.transform(df)
.select(regexp_replace("text_normalized", "\p{M}", ""))
.show())
## +--------------------------------------+
## |regexp_replace(text_normalized,\p{M},)|
## +--------------------------------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## +--------------------------------------+
Please note that this follows the same conventions as built in text transformers and is not null safe. You can easily correct for that by check for null in createTransformFunc.
Another way for doing using python Unicode Database :
import unicodedata
import sys
from pyspark.sql.functions import translate, regexp_replace
def make_trans():
matching_string = ""
replace_string = ""
for i in range(ord(" "), sys.maxunicode):
name = unicodedata.name(chr(i), "")
if "WITH" in name:
try:
base = unicodedata.lookup(name.split(" WITH")[0])
matching_string += chr(i)
replace_string += base
except KeyError:
pass
return matching_string, replace_string
def clean_text(c):
matching_string, replace_string = make_trans()
return translate(
regexp_replace(c, "\p{M}", ""),
matching_string, replace_string
).alias(c)
So now let's test it :
df = sc.parallelize([
(1, "Maracaibó"), (2, "New York"),
(3, " São Paulo "), (4, "~Madrid"),
(5, "São Paulo"), (6, "Maracaibó")
]).toDF(["id", "text"])
df.select(clean_text("text")).show()
## +---------------+
## | text|
## +---------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## | Sao Paulo|
## | Maracaibo|
## +---------------+
acknowledge #zero323
This solution is Python only, but is only useful if the number of possible accents is low (e.g. one single language like Spanish) and the character replacements are manually specified.
There seems to be no built-in way to do what you asked for directly without UDFs, however you can chain many regexp_replace calls to replace each possible accented character. I tested the performance of this solution and it turns out that it only runs faster if you have a very limited set of accents to replace. If that's the case it can be faster than UDFs because it is optimized outside of Python.
from pyspark.sql.functions import col, regexp_replace
accent_replacements_spanish = [
(u'á', 'a'), (u'Á', 'A'),
(u'é', 'e'), (u'É', 'E'),
(u'í', 'i'), (u'Í', 'I'),
(u'ò', 'o'), (u'Ó', 'O'),
(u'ú|ü', 'u'), (u'Ú|Ű', 'U'),
(u'ñ', 'n'),
# see http://stackoverflow.com/a/18123985/3810493 for other characters
# this will convert other non ASCII characters to a question mark:
('[^\x00-\x7F]', '?')
]
def remove_accents(column):
r = col(column)
for a, b in accent_replacements_spanish:
r = regexp_replace(r, a, b)
return r.alias('remove_accents(' + column + ')')
df = sqlContext.createDataFrame([['Olà'], ['Olé'], ['Núñez']], ['str'])
df.select(remove_accents('str')).show()
I haven't compared the performance with the other responses and this function is not as general, but it is at least worth considering because you don't need to add Scala or Java to your build process.
Here's my implementation.
Apart from accents I also remove speciach characters. Because I needed to pivot and save a table, and you can't save a table with column name that has " ,;{}()\n\t=\/" characters.
import re
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from unidecode import unidecode
spark = SparkSession.builder.getOrCreate()
data = [(1, " \\ / \\ {____} aŠdá_ \t = \n () asd ____aa 2134_ 23_"), (1, "N"), (2, "false"), (2, "1"), (3, "NULL"),
(3, None)]
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
df.show()
for col_name in ["txt"]:
tmp_dict = {}
for col_value in [row[0] for row in df.select(col_name).distinct().toLocalIterator()
if row[0] is not None]:
new_col_value = re.sub("[ ,;{}()\\n\\t=\\\/]", "_", col_value)
new_col_value = re.sub('_+', '_', new_col_value)
if new_col_value.startswith("_"):
new_col_value = new_col_value[1:]
if new_col_value.endswith("_"):
new_col_value = new_col_value[:-1]
new_col_value = unidecode(new_col_value)
tmp_dict[col_value] = new_col_value.lower()
df = df.na.replace(to_replace=tmp_dict, subset=[col_name])
df.show()
if you can't access external librares (like me) you can replace unidecode with
new_col_value = new_col_value.translate(str.maketrans(
"ä,ö,ü,ẞ,á,ä,č,ď,é,ě,í,ĺ,ľ,ň,ó,ô,ŕ,š,ť,ú,ů,ý,ž,Ä,Ö,Ü,ẞ,Á,Ä,Č,Ď,É,Ě,Í,Ĺ,Ľ,Ň,Ó,Ô,Ŕ,Š,Ť,Ú,Ů,Ý,Ž",
"a,o,u,s,a,a,c,d,e,e,i,l,l,n,o,o,r,s,t,u,u,y,z,A,O,U,S,A,A,C,D,E,E,I,L,L,N,O,O,R,S,T,U,U,Y,Z"))

Categories