pandas_udf operating on two ArrayType(StringType()) fields

Question

I have written a UDF. It is very slow. I would like to replace it with a pandas_udf to take advantage of vectorization.

The actual udf is a bit more complicated, but I have created a simplified toy version of it.

My question: is it possible to replace the UDF in my toy example with a pandas_udf that would take advantage of vectorization? If not, why not?

P.S: I know I could achieve the same effect without a UDF. That is because I simplified the example, but that is not my goal.

from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, StringType
import pandas as pd

#Example data
df = spark.createDataFrame(pd.DataFrame({ 'Letter': [['A', 'A', 'C'], ['A', 'C', 'A', 'D']],
                                          'Number': [[2, 1, 1], [3, 1, 1, 2]],
                                        })
                          )

# The UDF I hope to replace with a pandas_udf
@f.udf(ArrayType(StringType()))
def array_func(le, nr):
    res=[]
    for i in range(len(nr)):
        if nr[i]==1:
            res.append(le[i])
        else:
            res.append('Nope')
    return res

# Applying the udf
df = df.withColumn('udf', array_func('Letter','Number'))
df.show()

niuer niuer · Accepted Answer · 2019-09-05T18:53:22

How about this?

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
import pandas as pd

#Example data
df = spark.createDataFrame(pd.DataFrame({ 'Letter': [['A', 'A', 'C'], ['A', 'C', 'A', 'D']],
                                          'Number': [[2, 1, 1], [3, 1, 1, 2]],
                                        })
                          )
df.show()

# Add a dummy column so you can use groupby
df = df.withColumn('id', F.lit(1))
schm = StructType(df.schema.fields + [StructField('udf', ArrayType(StringType()), True)])
@pandas_udf(schm, PandasUDFType.GROUPED_MAP)
def array_udf(pdf):
    res=[]
    for ls, ns in zip(pdf['Letter'], pdf['Number']):
        r = [l if n == 1 else 'Nope' for l, n in zip(ls, ns)]
        res.append(r)
    pdf['udf'] = res
    return pdf

df = df.groupby('id').apply(array_udf).drop('id')
df.show()

The output:

+------------+------------+------------------+
|      Letter|      Number|               udf|
+------------+------------+------------------+
|   [A, A, C]|   [2, 1, 1]|      [Nope, A, C]|
|[A, C, A, D]|[3, 1, 1, 2]|[Nope, C, A, Nope]|
+------------+------------+------------------+

pandas_udf operating on two ArrayType(StringType()) fields

2 Answers