How to get the column names holding the top 3 largest values in each row of a PySpark DataFrame
sample dataframe
id  a1  a2  a3  a4  a5  a6
 0   5  23   4   1   4   5
 1   6  43   2   2  98  43
 2   3  56   3   1  23   3
 3   2   2   6   3   5   2
 4   5   6   7   2   7   5
I need output like this:
top1  top2  top3
  a2    a1    a6
  a5    a2    a6
Hope this helps!
from pyspark.sql.functions import col, udf, array, sort_array
from pyspark.sql.types import StringType

# Build the sample DataFrame (requires an active SparkContext `sc`).
df = sc.parallelize([(0, 5, 23, 4, 1, 4, 5),
                     (1, 6, 43, 2, 2, 98, 43),
                     (2, 3, 56, 3, 1, 23, 3),
                     (3, 2, 2, 6, 3, 5, 2),
                     (4, 5, 6, 7, 2, 7, 5)]). \
    toDF(["id", "a1", "a2", "a3", "a4", "a5", "a6"])

df_col = df.columns

# For each row, sort the value columns (everything after `id`) descending and
# keep the 1st/2nd/3rd largest values as helper columns.
df = df. \
    withColumn("top1_val", sort_array(array([col(x) for x in df_col[1:]]), asc=False)[0]). \
    withColumn("top2_val", sort_array(array([col(x) for x in df_col[1:]]), asc=False)[1]). \
    withColumn("top3_val", sort_array(array([col(x) for x in df_col[1:]]), asc=False)[2])


def modify_values(r, max_col):
    """Return the names of every value column in row `r` equal to `max_col`.

    `r` is the array of values from columns a1..a6 (in df_col order);
    `max_col` is one of the topN_val helper values. Returns a list of
    matching column names — a list, not a single name, so duplicated
    values (ties) yield all tied columns, as visible in the sample output.
    """
    matches = []
    for i in range(len(df_col[1:])):
        if r[i] == max_col:
            # i indexes the value array; i+1 skips the `id` column in df_col.
            matches.append(df_col[i + 1])
    return matches


# NOTE(review): the UDF returns a Python list but is declared StringType, so
# Spark renders it as its string representation (e.g. "[a2]"). Declare
# ArrayType(StringType()) if you need a real array column.
modify_values_udf = udf(modify_values, StringType())

# df.columns[1:-3] selects a1..a6, excluding `id` and the three *_val helpers.
# The bare strings "topN_val" are resolved by Spark as column references.
df1 = df. \
    withColumn("top1", modify_values_udf(array(df.columns[1:-3]), "top1_val")). \
    withColumn("top2", modify_values_udf(array(df.columns[1:-3]), "top2_val")). \
    withColumn("top3", modify_values_udf(array(df.columns[1:-3]), "top3_val"))
df1.show()
output is:
+---+---+---+---+---+---+---+--------+--------+--------+--------+--------+------------+
| id| a1| a2| a3| a4| a5| a6|top1_val|top2_val|top3_val|    top1|    top2|        top3|
+---+---+---+---+---+---+---+--------+--------+--------+--------+--------+------------+
|  0|  5| 23|  4|  1|  4|  5|      23|       5|       5|    [a2]|[a1, a6]|    [a1, a6]|
|  1|  6| 43|  2|  2| 98| 43|      98|      43|      43|    [a5]|[a2, a6]|    [a2, a6]|
|  2|  3| 56|  3|  1| 23|  3|      56|      23|       3|    [a2]|    [a5]|[a1, a3, a6]|
|  3|  2|  2|  6|  3|  5|  2|       6|       5|       3|    [a3]|    [a5]|        [a4]|
|  4|  5|  6|  7|  2|  7|  5|       7|       7|       6|[a3, a5]|[a3, a5]|        [a2]|
+---+---+---+---+---+---+---+--------+--------+--------+--------+--------+------------+
Comments
Post a Comment