2
votes

I have a pyspark Dataframe.

Example:

ID   |    phone   |  name <array>  | age <array>
-------------------------------------------------
12   | 827556     | ['AB','AA']    |  ['CC']
-------------------------------------------------
45   |  87346     |  null          |   ['DD']
-------------------------------------------------
56   |  98356     |  ['FF']        |  null
-------------------------------------------------
34   |  87345     |   ['AA','BB']  |  ['BB']

I want to concatenate the 2 arrays name and age. I did it like this:

# Attempt: build new_column by passing both array columns to F.concat.
df = df.withColumn("new_column", F.concat(df.name, df.age))
# Keep only ID, phone and the combined column.
df = df.select("ID", "phone", "new_column")

But I got some missing columns. It seems the concat function works on strings, not on arrays. I also want to remove the duplicates:

Result expected:

ID   |    phone   |  new_column <array>  
----------------------------------------
12   | 827556     | ['AB','AA','CC']    
----------------------------------------
45   |  87346     |  ['DD']             
----------------------------------------
56   |  98356     |  ['FF']        
----------------------------------------
34   |  87345     |   ['AA','BB']    
----------------------------------------

How can I concatenate two arrays in PySpark, given that I'm using a Spark version < 2.4?

Thank you

4

4 Answers

3
votes

You could use selectExpr as well.

testdata = [(0, ['AB','AA'],  ['CC']), (1, None, ['DD']), (2,  ['FF'] ,None), (3,  ['AA','BB'] , ['BB'])]
df = spark.createDataFrame(testdata, ['id', 'name', 'age'])

>>> df.show()
+---+--------+----+
| id|    name| age|
+---+--------+----+
|  0|[AB, AA]|[CC]|
|  1|    null|[DD]|
|  2|    [FF]|null|
|  3|[AA, BB]|[BB]|
+---+--------+----+

>>> df.selectExpr('''array(concat_ws(',',name,age)) as joined''').show()
+----------+
|    joined|
+----------+
|[AB,AA,CC]|
|      [DD]|
|      [FF]|
|[AA,BB,BB]|
+----------+
2
votes

For Spark < 2.4, we need a UDF to concatenate the arrays. Hope this helps.

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Sample data: c2 and c3 are string arrays, either of which may be null.
df = spark.createDataFrame(
    [
        ('a', ['AA', 'AB'], ['BC']),
        ('b', None, ['CB']),
        ('c', ['AB', 'BA'], None),
        ('d', ['AB', 'BB'], ['BB']),
    ],
    ['c1', 'c2', 'c3'],
)
df.show()
+---+--------+----+
| c1| c2     | c3 |
+---+--------+----+
| a|[AA, AB] |[BC]|
| b| null    |[CB]|
| c|[AB, BA] |null|
| d|[AB, BB] |[BB]|
+---+--------+----+

## changing null to empty array

# Replace a null array with an empty one, one column at a time.
df = df.withColumn('c2', F.coalesce(df['c2'], F.array()))
df = df.withColumn('c3', F.coalesce(df['c3'], F.array()))
df.show()
+---+--------+----+
| c1| c2     | c3 |
+---+--------+----+
| a|[AA, AB] |[BC]|
| b| []      |[CB]|
| c|[AB, BA] | [] |
| d|[AB, BB] |[BB]|
+---+--------+----+

## UDF to concat the columns and remove the duplicates

def _concat_distinct(left, right):
    """Return the items of *left* followed by *right* with duplicates dropped.

    A null array column reaches the UDF as None and is treated as empty.
    First-seen order is tracked explicitly instead of relying on dict
    ordering, which is only guaranteed from Python 3.7 onwards.
    """
    seen = set()
    merged = []
    for item in (left or []) + (right or []):
        if item not in seen:
            seen.add(item)
            merged.append(item)
    return merged


udf1 = F.udf(_concat_distinct, ArrayType(StringType()))
df = df.withColumn('concatd', udf1(df.c2, df.c3))
df.show()
+---+--------+----+------------+
| c1| c2     | c3 | concatd    |
+---+--------+----+------------+
| a|[AA, AB] |[BC]|[AA, AB, BC]|
| b| []      |[CB]| [CB]       |
| c|[AB, BA] | [] | [AB, BA]   |
| d|[AB, BB] |[BB]| [AB, BB]   |
+---+--------+----+------------+
1
votes

Will this help? (Note that `concat` only accepts array columns from Spark 2.4 onwards.)

from pyspark.sql.functions import col, concat 
# Sample data: two string-array columns with no nulls.
testdata = [
    (0, ['a', 'b', 'd'], ['a2', 'b2', 'd2']),
    (1, ['c'], ['c2']),
    (2, ['d', 'e'], ['d2', 'e2']),
]
df = spark.createDataFrame(testdata, ['id', 'codes', 'codes2'])

# Add new_column holding the elements of codes followed by those of codes2.
df2 = df.withColumn(
    "new_column",
    concat(col("codes"), col("codes2")),
)

After concatenation, the result is:

+---+---------+------------+--------------------+ 
| id| codes   | codes2     | new_column         | 
+---+---------+------------+--------------------+ 
| 0 |[a, b, d]|[a2, b2, d2]|[a, b, d, a2, b2,...| 
| 1 |[c]      |[c2]        |[c, c2]             | 
| 2 |[d, e]   |[d2, e2]    |[d, e, d2, e2]      | 
+---+---------+------------+--------------------+

Regards

1
votes

A Spark solution (Spark < 2.4) without using a UDF is shown below:

import pyspark.sql.functions as F
# Sample rows: each side may hold an array or be null.
testdata = [
    (0, ['AB', 'AA'], ['CC']),
    (1, None, ['DD']),
    (2, ['FF'], None),
    (3, ['AA', 'BB'], ['BB']),
]
df = spark.createDataFrame(testdata, ['id', 'name', 'age'])
df.show()

+---+--------+----+
| id|    name| age|
+---+--------+----+
|  0|[AB, AA]|[CC]|
|  1|    null|[DD]|
|  2|    [FF]|null|
|  3|[AA, BB]|[BB]|
+---+--------+----+

# Flatten each array column into a comma-separated string; a null array
# becomes an empty string.
df = df.withColumn('name', F.concat_ws(',', 'name'))
df = df.withColumn('age', F.concat_ws(',', 'age'))
# Joining the two strings leaves a stray leading or trailing comma whenever
# one side is empty, so strip it. The pattern is a raw string with a plain
# comma: "\," is an invalid escape sequence in a normal Python string.
df = df.withColumn("new_column", F.concat_ws(',', df.name, df.age))
df = df.withColumn("new_column", F.regexp_replace(df.new_column, r"^,|,$", ''))
# Split the cleaned string back into an array (duplicates are kept).
df.withColumn("new_column", F.split(df.new_column, ",")).show(5, False)

+---+-----+---+------------+
|id |name |age|new_column  |
+---+-----+---+------------+
|0  |AB,AA|CC |[AB, AA, CC]|
|1  |     |DD |[DD]        |
|2  |FF   |   |[FF]        |
|3  |AA,BB|BB |[AA, BB, BB]|
+---+-----+---+------------+