import
pyspark
from
pyspark.sql
import
SparkSession
def create_session():
    """Build and return a local SparkSession for the employee-profile app.

    Returns:
        SparkSession: a session bound to the local master, named
        "employee_profile.com" (reuses an existing session if one is active).
    """
    return (
        SparkSession.builder
        .master("local")
        .appName("employee_profile.com")
        .getOrCreate()
    )
def create_df(spark, data, schema):
    """Create a DataFrame from *data* with the given *schema*.

    Args:
        spark: active SparkSession used to build the frame.
        data: iterable of row tuples.
        schema: column names (or schema object) accepted by createDataFrame.

    Returns:
        The resulting Spark DataFrame.
    """
    return spark.createDataFrame(data, schema)
if __name__ == "__main__":
    spark = create_session()
    try:
        # (Id, Name, Job Profile, Salary, City) employee records.
        input_data = [
            (1, "Shivansh", "Data Scientist", 2000000, "Noida"),
            (2, "Rishabh", "Software Developer", 1500000, "Bangalore"),
            (3, "Swati", "Data Analyst", 1000000, "Hyderabad"),
            (4, "Amar", "Data Analyst", 950000, "Noida"),
            (5, "Arpit", "Android Developer", 1600000, "Pune"),
            (6, "Ranjeet", "Python Developer", 1800000, "Gurugram"),
            (7, "Priyanka", "Full Stack Developer", 2200000, "Bangalore"),
        ]
        schema = ["Id", "Name", "Job Profile", "Salary", "City"]

        df = create_df(spark, input_data, schema)

        # Drop the "Job Profile" column by mapping each Row to a tuple of
        # the remaining fields, then converting the RDD back to a DataFrame.
        rdd = df.rdd.map(
            lambda row: (row["Id"], row["Name"], row["Salary"], row["City"])
        )
        df2 = rdd.toDF(["Id", "Name", "Salary", "City"])
        df2.show()
    finally:
        # Bug fix: the original script never released the session, leaking
        # the underlying SparkContext/JVM resources on every run.
        spark.stop()