Convert CSV file to Json using pyspark - apache-spark

I am facing an issue while converting a .csv file to a .json file using PySpark.
My code:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used for both the read and the write.
spark = SparkSession.builder.getOrCreate()
# Read the CSV with a comma delimiter and schema inference switched on.
df_pubs = spark.read.options(inferSchema='True',delimiter=',').csv("filename.csv")
# Earlier attempts, left commented out by the author:
#df_pubs_json = df_pubs.write.mode(SaveMode.Overwrite).json("filename.json")
#df_pubs_json = df_pubs.toJSON("filename.json")\
# Write the DataFrame as JSON, overwriting any existing output. This save()
# call is the one named in the reported Py4JJavaError (o31.save).
df_pubs.write.format("json").mode("overwrite").save("filename.json")
Error
py4j.protocol.Py4JJavaError: An error occurred while calling o31.save.
: ExitCodeException exitCode=-1073741515:

# Read the comma-delimited file from the question, treating the first row as
# the header and letting Spark infer the column types.
df = spark.read.options(header=True,inferSchema=True,delimiter=",").csv("filename.csv")
# Serialise every row to a JSON string, then parse those strings back into a
# DataFrame.
df1 = df.toJSON()
df_final = spark.read.json(df1)
# Convert to pandas and let pandas write a single JSON array of records.
# NOTE(review): toPandas() brings the whole dataset to the driver, and this
# presumably sidesteps the Spark file writer that raised the error - confirm.
df_final.toPandas().to_json("filename.json",orient='records')

Related

getting error while trying to read athena table in spark

I have the following code snippet in pyspark:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.context import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
import pyspark.sql.dataframe
def validate_data():
    """Print the row counts of tables db1.data_dest and db2.data_source."""
    conf = SparkConf().setAppName("app")
    # NOTE: `spark` is a SparkContext here, not a SparkSession; the `.read`
    # accesses below are what raise the reported AttributeError.
    spark = SparkContext(conf=conf)
    config = {
        "val_path" : "s3://forecasting/data/validation.csv"
    }
    data1_df = spark.read.table("db1.data_dest")
    data2_df = spark.read.table("db2.data_source")
    print(data1_df.count())
    print(data2_df.count())


if __name__ == "__main__":
    validate_data()
This code works fine when run in a Jupyter notebook on SageMaker (connected to EMR),
but when we run it as a Python script from the terminal, it throws this error:
Error message
AttributeError: 'SparkContext' object has no attribute 'read'
We have to automate these notebooks, so we are trying to convert them to python scripts
You can only call read on a Spark Session, not on a Spark Context.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("app")
# builder.config(...) only returns the Builder; getOrCreate() is what yields
# the SparkSession object on which `.read` can be called.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
Or you can convert the Spark context to a Spark session
conf = SparkConf().setAppName("app")
sc = SparkContext(conf=conf)
# Wrap the existing context in a session; `read` is then called on `spark`.
spark = SparkSession(sc)

Fetching data from REST API to Spark Dataframe using Pyspark

I am building a data pipeline which consumes data from a REST API in JSON format and pushes it to a Spark DataFrame. Spark version: 2.4.4
But I am getting the following error:
df = SQLContext.jsonRDD(rdd)
AttributeError: type object 'SQLContext' has no attribute 'jsonRDD'
Code:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
from pyspark import SQLContext
import json
# Create (or reuse) the SparkSession for the "DataCleansing" application.
spark = SparkSession \
.builder \
.appName("DataCleansing") \
.getOrCreate()
def convert_single_object_per_line(json_list):
    """Serialise each item of ``json_list`` as JSON on its own line.

    Returns one string in which every item's JSON text is followed by a
    newline; an empty input yields an empty string.
    """
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(json.dumps(item) + "\n" for item in json_list)
def parse_dataframe(json_data):
    """Build a DataFrame from parsed JSON records via an RDD of JSON lines."""
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = spark.sparkContext.parallelize(mylist)
    # This is the call that raises the reported AttributeError.
    df = SQLContext.jsonRDD(rdd)
    return df
# Fetch the endpoint, decode the response body as JSON, then hand the parsed
# records to parse_dataframe.
url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Is there a better way to query a REST API and load the data into a Spark DataFrame using PySpark?
I am not sure if I am missing something.
Check the Spark REST API data source. One advantage of this library is that it uses multiple executors to fetch data from the REST API and creates the DataFrame for you.
In your code, you are fetching all the data into the driver and then creating the DataFrame; it might fail with a heap-space error if the data is very large.
url = "https://mylink"
# Options handed to the REST data source; every value is given as a string.
options = { 'url' : url, 'method' : 'GET', 'readTimeout' : '10000', 'connectionTimeout' : '2000', 'partitions' : '10'}
# Now we create the Dataframe which contains the result from the call to the API
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()

Calling JSON data from Restapi in pyspark throwing Error

I am trying to query a REST API and load the data into a DataFrame using PySpark,
but it throws the following error:
File "C:/Users/QueryRestapi.py", line 30, in <module>
df = parse_dataframe(json_data)
File "C:/Users/QueryRestapi.py", line 22, in parse_dataframe
rdd = SparkContext.parallelize(mylist)
TypeError: unbound method parallelize() must be called with SparkContext instance as first argument (got list instance instead)
Code:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
import json
# Create (or reuse) the SparkSession for the "DataCleansing" application.
spark = SparkSession \
.builder \
.appName("DataCleansing") \
.getOrCreate()
def convert_single_object_per_line(json_list):
    """Serialise each item of ``json_list`` as JSON on its own line.

    Returns one string in which every item's JSON text is followed by a
    newline; an empty input yields an empty string.
    """
    # join() builds the result in one pass instead of repeated string +=.
    return "".join(json.dumps(item) + "\n" for item in json_list)
def parse_dataframe(json_data):
    """Build a DataFrame from parsed JSON records via an RDD of JSON lines."""
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    # parallelize is invoked on the SparkContext class rather than on an
    # instance: this is the line that raises the reported TypeError.
    rdd = SparkContext.parallelize(mylist)
    df = sqlContext.jsonRDD(rdd)
    return df
# Fetch the endpoint, decode the response body as JSON, then hand the parsed
# records to parse_dataframe (the call the traceback enters from <module>).
url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Please let me know if I am missing something. Thanks a lot.

Error while using dataframe show method in pyspark

I am trying to read data from BigQuery using pandas and PySpark. I am able to get the data, but I get the error below while converting it into a Spark DataFrame.
py4j.protocol.Py4JJavaError: An error occurred while calling o28.showString.
: java.lang.IllegalStateException: Could not find TLS ALPN provider; no working netty-tcnative, Conscrypt, or Jetty NPN/ALPN available
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.defaultSslProvider(GrpcSslContexts.java:258)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.configure(GrpcSslContexts.java:171)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.forClient(GrpcSslContexts.java:120)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder.buildTransportFactory(NettyChannelBuilder.java:401)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.AbstractManagedChannelImplBuilder.build(AbstractManagedChannelImplBuilder.java:444)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createSingleChannel(InstantiatingGrpcChannelProvider.java:223)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createChannel(InstantiatingGrpcChannelProvider.java:169)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.getTransportChannel(InstantiatingGrpcChannelProvider.java:156)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.ClientContext.create(ClientContext.java:157)
Following is the environment detail
Python version : 3.7
Spark version : 2.4.3
Java version : 1.8
The code is as follows:
import google.auth
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession , SQLContext
from google.cloud import bigquery
# Currently this only supports queries which have at least 10 MB of results
QUERY = """ SELECT * FROM test limit 1 """
#spark = SparkSession.builder.appName('Query Results').getOrCreate()
sc = pyspark.SparkContext()
bq = bigquery.Client()
print('Querying BigQuery')
project_id = ''
query_job = bq.query(QUERY,project=project_id)
# Wait for query execution
query_job.result()
# Read the query's destination table through the 'bigquery' data source.
df = SQLContext(sc).read.format('bigquery') \
.option('dataset', query_job.destination.dataset_id) \
.option('table', query_job.destination.table_id)\
.option("type", "direct")\
.load()
# show() is the call named in the reported Py4JJavaError (o28.showString).
df.show()
I am looking for some help to solve this issue.
I managed to find a better solution by referencing this link; below is my working code:
Install the pandas_gbq package in your Python environment before running the code below.
import pandas_gbq
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
project_id = "<your-project-id>"
query = """ SELECT * from testSchema.testTable"""

# Run the query through pandas_gbq and keep the result for the conversion.
pandas_df = pandas_gbq.read_gbq(
    query=query,
    project_id=project_id,
    dialect='standard',
)

# Start a Spark context and wrap it in a session.
sc = SparkContext()
spark = SparkSession(sc)

# Convert the query result into a Spark DataFrame.
spark_df = spark.createDataFrame(pandas_df)

# Check the conversion: print the row count, then display the rows.
print(spark_df.count())
spark_df.show()
Hope it helps someone! Keep pysparking :)

AttributeError: 'StructField' object has no attribute '_get_object_id': with loading parquet file with custom schema

I am trying to read group of parquet files using PySpark using custom schema but it gives AttributeError: 'StructField' object has no attribute '_get_object_id' error.
Here is my sample code:
import pyspark
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.sql.types import *
sc = pyspark.SparkContext()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
# Sample trades as tuples: (global_order_id, nanos, message_type, price, symbol).
l = [('1',31200,'Execute',140,'ABC'),('2',31201,'Execute',140,'ABC'),('3',31202,'Execute',142,'ABC'),
('4',31103,'Execute',149,'DEF'),('5',31204,'Execute',145,'DEF'),('6',31205,'Execute',149,'DEF')]
rdd = sc.parallelize(l)
trades = rdd.map(lambda x: Row(global_order_id=int(x[0]), nanos=int(x[1]),message_type=x[2], price=int(x[3]),symbol=x[4]))
trades_df = sqlContext.createDataFrame(trades)
trades_df.printSchema()
# Write the sample data as parquet, then read it back without a custom schema.
trades_df.write.parquet('trades_parquet')
trades_df_Parquet = sqlContext.read.parquet('trades_parquet')
trades_df_Parquet.printSchema()
# The schema is encoded in a string.
schemaString = "global_order_id message_type nanos price symbol"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
# `schema` is passed positionally here, as the second argument after the path.
# NOTE(review): presumably the call that raises the reported AttributeError.
trades_df_Parquet_n = spark.read.format('parquet').load('trades_parquet',schema,inferSchema =False)
#trades_df_Parquet_n = spark.read.parquet('trades_parquet',schema)
trades_df_Parquet_n.printSchema()
Can anyone please help me with a suggestion?
Pass the schema as a keyword argument so that it is not interpreted as the positional `format` parameter:
Signature: DataFrameReader.load(path=None, format=None, schema=None, **options)
You get:
# Passing schema= by keyword keeps it out of the positional `format` slot.
trades_df_Parquet_n = spark.read.format('parquet').load('trades_parquet',schema=schema, inferSchema=False)

Resources