spark, scala

Transpose data with Spark

A short user defined function written in Scala which allows you to transpose a dataframe without performing aggregation functions. To perform a transpose with aggregations, see the pivot method.

// Import the requisite methods
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, lit, struct}

// Create a dataframe
val df = spark.createDataFrame(Seq(
  (1, 1, 2, 3, 8, 4, 5),
  (2, 4, 3, 8, 7, 9, 8),
  (3, 6, 1, 9, 2, 3, 6),
  (4, 7, 8, 6, 9, 4, 5),
  (5, 9, 2, 7, 8, 7, 3),
  (6, 1, 1, 4, 2, 8, 4)
)).toDF("uid", "col1", "col2", "col3", "col4", "col5", "col6")

df.show(10,false)

// Create the transpose user defined function.
// Imputs:
//   transDF: The dataframe which will be transposed
//   transBy: The column that the dataframe will be transposed by
// Outputs:
//   Dataframe datatype consisting of three columns:
//     transBy
//     column_name
//     column_value
def transposeUDF(transDF: DataFrame, transBy: Seq[String]): DataFrame = {
  val (cols, types) = transDF.dtypes.filter{ case (c, _) => !transBy.contains(c)}.unzip
  require(types.distinct.size == 1)      

  val kvs = explode(array(
    cols.map(c => struct(lit(c).alias("column_name"), col(c).alias("column_value"))): _*
  ))

  val byExprs = transBy.map(col(_))

  transDF
    .select(byExprs :+ kvs.alias("_kvs"): _*)
    .select(byExprs ++ Seq($"_kvs.column_name", $"_kvs.column_value"): _*)
}

transposeUDF(df, Seq("uid")).show(12,false)



Output:

df.show(10,false)
+---+----+----+----+----+----+----+
|uid|col1|col2|col3|col4|col5|col6|
+---+----+----+----+----+----+----+
|1  |1   |2   |3   |8   |4   |5   |
|2  |4   |3   |8   |7   |9   |8   |
|3  |6   |1   |9   |2   |3   |6   |
|4  |7   |8   |6   |9   |4   |5   |
|5  |9   |2   |7   |8   |7   |3   |
|6  |1   |1   |4   |2   |8   |4   |
+---+----+----+----+----+----+----+


transposeUDF(df, Seq("uid")).show(12,false) +---+-----------+------------+ |uid|column_name|column_value| +---+-----------+------------+ |1 |col1 |1 | |1 |col2 |2 | |1 |col3 |3 | |1 |col4 |8 | |1 |col5 |4 | |1 |col6 |5 | |2 |col1 |4 | |2 |col2 |3 | |2 |col3 |8 | |2 |col4 |7 | |2 |col5 |9 | |2 |col6 |8 | +---+-----------+------------+ only showing top 12 rows
Author image

About James Conner

Scuba dive master, wildlife photographer, anthropologist, programmer, electronics tinkerer and big data expert.