A short user defined function written in Scala which allows you to transpose a dataframe without performing aggregation functions. To perform a transpose with aggregations, see the pivot
method.
// Import the requisite methods
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, lit, struct}
// Create a dataframe
val df = spark.createDataFrame(Seq(
(1, 1, 2, 3, 8, 4, 5),
(2, 4, 3, 8, 7, 9, 8),
(3, 6, 1, 9, 2, 3, 6),
(4, 7, 8, 6, 9, 4, 5),
(5, 9, 2, 7, 8, 7, 3),
(6, 1, 1, 4, 2, 8, 4)
)).toDF("uid", "col1", "col2", "col3", "col4", "col5", "col6")
df.show(10,false)
// Create the transpose user defined function.
// Imputs:
// transDF: The dataframe which will be transposed
// transBy: The column that the dataframe will be transposed by
// Outputs:
// Dataframe datatype consisting of three columns:
// transBy
// column_name
// column_value
def transposeUDF(transDF: DataFrame, transBy: Seq[String]): DataFrame = {
val (cols, types) = transDF.dtypes.filter{ case (c, _) => !transBy.contains(c)}.unzip
require(types.distinct.size == 1)
val kvs = explode(array(
cols.map(c => struct(lit(c).alias("column_name"), col(c).alias("column_value"))): _*
))
val byExprs = transBy.map(col(_))
transDF
.select(byExprs :+ kvs.alias("_kvs"): _*)
.select(byExprs ++ Seq($"_kvs.column_name", $"_kvs.column_value"): _*)
}
transposeUDF(df, Seq("uid")).show(12,false)
Output:
df.show(10,false)
+---+----+----+----+----+----+----+
|uid|col1|col2|col3|col4|col5|col6|
+---+----+----+----+----+----+----+
|1 |1 |2 |3 |8 |4 |5 |
|2 |4 |3 |8 |7 |9 |8 |
|3 |6 |1 |9 |2 |3 |6 |
|4 |7 |8 |6 |9 |4 |5 |
|5 |9 |2 |7 |8 |7 |3 |
|6 |1 |1 |4 |2 |8 |4 |
+---+----+----+----+----+----+----+
transposeUDF(df, Seq("uid")).show(12,false)
+---+-----------+------------+
|uid|column_name|column_value|
+---+-----------+------------+
|1 |col1 |1 |
|1 |col2 |2 |
|1 |col3 |3 |
|1 |col4 |8 |
|1 |col5 |4 |
|1 |col6 |5 |
|2 |col1 |4 |
|2 |col2 |3 |
|2 |col3 |8 |
|2 |col4 |7 |
|2 |col5 |9 |
|2 |col6 |8 |
+---+-----------+------------+
only showing top 12 rows