from
pyspark.sql
import
SparkSession
from
pyspark.sql.functions
import
col,lit,create_map
# Obtain the session via the builder's getOrCreate() call.
spark_session = SparkSession.builder.getOrCreate()
# Sample employee rows; field order matches empColumns:
# (emp_id, name, superior_emp_id, year_joined, emp_dept_id, gender, salary)
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "F", 4000),
    (6, "Brown", 2, "2010", "50", "M", 2000),
]
# Column names for the employee rows, in positional order.
empColumns = [
    "emp_id",
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
]
# Build the DataFrame from the in-memory rows, using the column-name list
# as the schema.
empDF = spark_session.createDataFrame(
    data=emp,
    schema=empColumns,
)
# Fold every column except emp_id into a single map column named
# "employee_details" (column name -> value), then drop the originals.
#
# The same tuple drives both the map construction and the drop() call so the
# two cannot get out of sync.
detail_columns = (
    "name",
    "superior_emp_id",
    "year_joined",
    "emp_dept_id",
    "gender",
    "salary",
)

# create_map() takes alternating key, value columns. The source columns mix
# strings with integers (superior_emp_id, salary), so each value is cast to
# string explicitly to give the map a single, predictable value type rather
# than leaning on implicit type coercion.
# NOTE(review): implicit coercion of mixed string/integer map values is
# believed to differ when ANSI SQL mode is enabled -- confirm against the
# Spark version in use.
map_entries = [
    entry
    for column_name in detail_columns
    for entry in (lit(column_name), col(column_name).cast("string"))
]

empDF = empDF.withColumn(
    "employee_details",
    create_map(*map_entries),
).drop(*detail_columns)
# Print the result; truncate=False asks show() not to shorten long cell values.
empDF.show(truncate=False)