Convert CSV file to Json using pyspark

Convert CSV file to Json using pyspark - apache-spark

I am facing issue while converting .CSV file to .json file using pyspark.
MyCode
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_pubs = spark.read.options(inferSchema='True',delimiter=',').csv("filename.csv")
#df_pubs_json = df_pubs.write.mode(SaveMode.Overwrite).json("filename.json")
#df_pubs_json = df_pubs.toJSON("filename.json")\
df_pubs.write.format("json").mode("overwrite").save("filename.json")
Error
py4j.protocol.Py4JJavaError: An error occurred while calling o31.save.
: ExitCodeException exitCode=-1073741515:

df = spark.read.options(header=True,inferSchema=True,delimiter=",").csv("filename.tsv")
df1 = df.toJSON()
df_final = spark.read.json(df1)
df_final.toPandas().to_json("filename.json",orient='records')

Related

getting error while trying to read athena table in spark

I have the following code snippet in pyspark:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.context import SparkContext
from pyspark.sql import Row, SQLContext, SparkSession
import pyspark.sql.dataframe
def validate_data():
conf = SparkConf().setAppName("app")
spark = SparkContext(conf=conf)
config = {
"val_path" : "s3://forecasting/data/validation.csv"
}
data1_df = spark.read.table("db1.data_dest”)
data2_df = spark.read.table("db2.data_source”)
print(data1_df.count())
print(data2_df.count())
if __name__ == "__main__":
validate_data()
Now this code works fine when run on jupyter notebook on sagemaker ( connecting to EMR )
but when we are running as a python script on terminal, its throwing this error
Error message
AttributeError: 'SparkContext' object has no attribute 'read'
We have to automate these notebooks, so we are trying to convert them to python scripts

You can only call read on a Spark Session, not on a Spark Context.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("app")
spark = SparkSession.builder.config(conf=conf)
Or you can convert the Spark context to a Spark session
conf = SparkConf().setAppName("app")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

Fetching data from REST API to Spark Dataframe using Pyspark

i am building a datapipeline which consume data from RESTApi in json format and pushed to Spark Dataframe. Spark Version: 2.4.4
but getting error as
df = SQLContext.jsonRDD(rdd)
AttributeError: type object 'SQLContext' has no attribute 'jsonRDD'
Code:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
from pyspark import SQLContext
import json
spark = SparkSession \
.builder \
.appName("DataCleansing") \
.getOrCreate()
def convert_single_object_per_line(json_list):
json_string = ""
for line in json_list:
json_string += json.dumps(line) + "\n"
return json_string
def parse_dataframe(json_data):
r = convert_single_object_per_line(json_data)
mylist = []
for line in r.splitlines():
mylist.append(line)
rdd = spark.sparkContext.parallelize(mylist)
df = SQLContext.jsonRDD(rdd)
return df
url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
if there is any other better way to query RestApi and bring data to Spark Dataframe using Pyspark.
I am not sure if i am missing something.

Check Spark Rest API Data source. One advantage with this library is it will use multiple executors to fetch data rest api & create data frame for you.
In your code, you are fetching all data into the driver & creating DataFrame, It might fail with heap space if you have very huge data.
url = "https://mylink"
options = { 'url' : url, 'method' : 'GET', 'readTimeout' : '10000', 'connectionTimeout' : '2000', 'partitions' : '10'}
# Now we create the Dataframe which contains the result from the call to the API
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()

Calling JSON data from Restapi in pyspark throwing Error

i am trying query REST API to get a data to dataframe using pyspark.
but it is throwing error as
File "C:/Users/QueryRestapi.py", line 30, in <module>
df = parse_dataframe(json_data)
File "C:/Users/QueryRestapi.py", line 22, in parse_dataframe
rdd = SparkContext.parallelize(mylist)
TypeError: unbound method parallelize() must be called with SparkContext instance as first argument (got list instance instead)
Code:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
import json
spark = SparkSession \
.builder \
.appName("DataCleansing") \
.getOrCreate()
def convert_single_object_per_line(json_list):
json_string = ""
for line in json_list:
json_string += json.dumps(line) + "\n"
return json_string
def parse_dataframe(json_data):
r = convert_single_object_per_line(json_data)
mylist = []
for line in r.splitlines():
mylist.append(line)
rdd = SparkContext.parallelize(mylist)
df = sqlContext.jsonRDD(rdd)
return df
url = "https://"mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Please help me, if i am missing something.......Thanks a lot

Error while using dataframe show method in pyspark

I am trying to read data from BigQuery using pandas and pyspark. I am able to get the data but somehow getting below error while converting it into Spark DataFrame.
py4j.protocol.Py4JJavaError: An error occurred while calling o28.showString.
: java.lang.IllegalStateException: Could not find TLS ALPN provider; no working netty-tcnative, Conscrypt, or Jetty NPN/ALPN available
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.defaultSslProvider(GrpcSslContexts.java:258)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.configure(GrpcSslContexts.java:171)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.forClient(GrpcSslContexts.java:120)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder.buildTransportFactory(NettyChannelBuilder.java:401)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.AbstractManagedChannelImplBuilder.build(AbstractManagedChannelImplBuilder.java:444)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createSingleChannel(InstantiatingGrpcChannelProvider.java:223)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createChannel(InstantiatingGrpcChannelProvider.java:169)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.getTransportChannel(InstantiatingGrpcChannelProvider.java:156)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.ClientContext.create(ClientContext.java:157)
Following is the environment detail
Python version : 3.7
Spark version : 2.4.3
Java version : 1.8
The code is as follow
import google.auth
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession , SQLContext
from google.cloud import bigquery
# Currently this only supports queries which have at least 10 MB of results
QUERY = """ SELECT * FROM test limit 1 """
#spark = SparkSession.builder.appName('Query Results').getOrCreate()
sc = pyspark.SparkContext()
bq = bigquery.Client()
print('Querying BigQuery')
project_id = ''
query_job = bq.query(QUERY,project=project_id)
# Wait for query execution
query_job.result()
df = SQLContext(sc).read.format('bigquery') \
.option('dataset', query_job.destination.dataset_id) \
.option('table', query_job.destination.table_id)\
.option("type", "direct")\
.load()
df.show()
I am looking some help to solve this issue.

I managed to find the better solution referencing this link , below is my working code :
Install pandas_gbq package in python library before writing below code .
import pandas_gbq
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
project_id = "<your-project-id>"
query = """ SELECT * from testSchema.testTable"""
athletes = pandas_gbq.read_gbq(query=query, project_id=project_id,dialect = 'standard')
# Get a reference to the Spark Session
sc = SparkContext()
spark = SparkSession(sc)
# convert from Pandas to Spark
sparkDF = spark.createDataFrame(athletes)
# perform an operation on the DataFrame
print(sparkDF.count())
sparkDF.show()
Hope it helps to someone ! Keep pysparking :)

AttributeError: 'StructField' object has no attribute '_get_object_id': with loading parquet file with custom schema

I am trying to read group of parquet files using PySpark using custom schema but it gives AttributeError: 'StructField' object has no attribute '_get_object_id' error.
Here is my sample code:
import pyspark
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.sql.types import *
sc = pyspark.SparkContext()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
l = [('1',31200,'Execute',140,'ABC'),('2',31201,'Execute',140,'ABC'),('3',31202,'Execute',142,'ABC'),
('4',31103,'Execute',149,'DEF'),('5',31204,'Execute',145,'DEF'),('6',31205,'Execute',149,'DEF')]
rdd = sc.parallelize(l)
trades = rdd.map(lambda x: Row(global_order_id=int(x[0]), nanos=int(x[1]),message_type=x[2], price=int(x[3]),symbol=x[4]))
trades_df = sqlContext.createDataFrame(trades)
trades_df.printSchema()
trades_df.write.parquet('trades_parquet')
trades_df_Parquet = sqlContext.read.parquet('trades_parquet')
trades_df_Parquet.printSchema()
# The schema is encoded in a string.
schemaString = "global_order_id message_type nanos price symbol"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
trades_df_Parquet_n = spark.read.format('parquet').load('trades_parquet',schema,inferSchema =False)
#trades_df_Parquet_n = spark.read.parquet('trades_parquet',schema)
trades_df_Parquet_n.printSchema()
Can any one please help me with your suggestion.

Specify the name of the option schema so it knows it's not format:
Signature: trades_df_Parquet_n.load(path=None, format=None, schema=None, **options)
You get:
trades_df_Parquet_n = spark.read.format('parquet').load('trades_parquet',schema=schema, inferSchema=False)

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Convert CSV file to Json using pyspark - apache-spark

df = spark.read.options(header=True,inferSchema=True,delimiter=",").csv("filename.tsv") df1 = df.toJSON() df_final = spark.read.json(df1) df_final.toPandas().to_json("filename.json",orient='records')

Related

getting error while trying to read athena table in spark

Fetching data from REST API to Spark Dataframe using Pyspark

Calling JSON data from Restapi in pyspark throwing Error

Error while using dataframe show method in pyspark

AttributeError: 'StructField' object has no attribute '_get_object_id': with loading parquet file with custom schema

Categories

Resources