Fetching data from a REST API into a Spark DataFrame using PySpark - apache-spark

I am building a data pipeline that consumes data from a REST API in JSON format and pushes it into a Spark DataFrame. Spark version: 2.4.4
But I am getting this error:
df = SQLContext.jsonRDD(rdd)
AttributeError: type object 'SQLContext' has no attribute 'jsonRDD'
Code:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
from pyspark import SQLContext
import json

spark = SparkSession \
    .builder \
    .appName("DataCleansing") \
    .getOrCreate()

def convert_single_object_per_line(json_list):
    json_string = ""
    for line in json_list:
        json_string += json.dumps(line) + "\n"
    return json_string

def parse_dataframe(json_data):
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = spark.sparkContext.parallelize(mylist)
    df = SQLContext.jsonRDD(rdd)
    return df

url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Is there any better way to query a REST API and bring the data into a Spark DataFrame using PySpark?
I am not sure if I am missing something.

Check out the Spark REST API Data Source. One advantage of this library is that it uses multiple executors to fetch data from the REST API and build the DataFrame for you.
In your code, you are fetching all the data into the driver and creating the DataFrame there; with very large payloads this can fail with a heap-space error.
url = "https://mylink"
options = {'url': url, 'method': 'GET', 'readTimeout': '10000', 'connectionTimeout': '2000', 'partitions': '10'}

# Now we create the DataFrame which contains the result from the call to the API
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()
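If you prefer to keep the driver-side approach from the question, note that jsonRDD was removed in Spark 2.x. A minimal sketch of the usual replacement, spark.read.json over an RDD of JSON strings (assuming json_data is the parsed list from the question and spark is the session created above):

import json

def parse_dataframe(json_data):
    # one JSON object per line, then let Spark infer the schema
    json_lines = [json.dumps(record) for record in json_data]
    rdd = spark.sparkContext.parallelize(json_lines)
    return spark.read.json(rdd)

df = parse_dataframe(json_data)
df.show()

This still pulls the whole payload through the driver, so for large responses the REST data source above remains the better option.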

Related

Calling JSON data from Restapi in pyspark throwing Error

I am trying to query a REST API to get data into a DataFrame using PySpark, but it is throwing this error:
File "C:/Users/QueryRestapi.py", line 30, in <module>
    df = parse_dataframe(json_data)
File "C:/Users/QueryRestapi.py", line 22, in parse_dataframe
    rdd = SparkContext.parallelize(mylist)
TypeError: unbound method parallelize() must be called with SparkContext instance as first argument (got list instance instead)
Code:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
import json

spark = SparkSession \
    .builder \
    .appName("DataCleansing") \
    .getOrCreate()

def convert_single_object_per_line(json_list):
    json_string = ""
    for line in json_list:
        json_string += json.dumps(line) + "\n"
    return json_string

def parse_dataframe(json_data):
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = SparkContext.parallelize(mylist)
    df = sqlContext.jsonRDD(rdd)
    return df

url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Please help me if I am missing something. Thanks a lot.
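A minimal sketch of the likely fix, assuming the same SparkSession as in the question: parallelize is an instance method, so it has to be called on a SparkContext object rather than on the class, and jsonRDD no longer exists in Spark 2.x:

# call parallelize on the SparkContext owned by the session, not on the class
rdd = spark.sparkContext.parallelize(mylist)
# sqlContext.jsonRDD was removed; spark.read.json accepts an RDD of JSON strings
df = spark.read.json(rdd)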

Error while using dataframe show method in pyspark

I am trying to read data from BigQuery using pandas and PySpark. I am able to get the data, but I get the error below while converting it into a Spark DataFrame.
py4j.protocol.Py4JJavaError: An error occurred while calling o28.showString.
: java.lang.IllegalStateException: Could not find TLS ALPN provider; no working netty-tcnative, Conscrypt, or Jetty NPN/ALPN available
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.defaultSslProvider(GrpcSslContexts.java:258)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.configure(GrpcSslContexts.java:171)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.GrpcSslContexts.forClient(GrpcSslContexts.java:120)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.netty.shaded.io.grpc.netty.NettyChannelBuilder.buildTransportFactory(NettyChannelBuilder.java:401)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.AbstractManagedChannelImplBuilder.build(AbstractManagedChannelImplBuilder.java:444)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createSingleChannel(InstantiatingGrpcChannelProvider.java:223)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.createChannel(InstantiatingGrpcChannelProvider.java:169)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.InstantiatingGrpcChannelProvider.getTransportChannel(InstantiatingGrpcChannelProvider.java:156)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.ClientContext.create(ClientContext.java:157)
The environment details are as follows:
Python version : 3.7
Spark version : 2.4.3
Java version : 1.8
The code is as follows:
import google.auth
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from google.cloud import bigquery

# Currently this only supports queries which have at least 10 MB of results
QUERY = """ SELECT * FROM test limit 1 """

# spark = SparkSession.builder.appName('Query Results').getOrCreate()
sc = pyspark.SparkContext()
bq = bigquery.Client()

print('Querying BigQuery')
project_id = ''
query_job = bq.query(QUERY, project=project_id)

# Wait for query execution
query_job.result()

df = SQLContext(sc).read.format('bigquery') \
    .option('dataset', query_job.destination.dataset_id) \
    .option('table', query_job.destination.table_id) \
    .option("type", "direct") \
    .load()

df.show()
I am looking for some help to solve this issue.
I managed to find a better solution by referencing this link; below is my working code.
Install the pandas_gbq package in your Python environment before running the code below.
import pandas_gbq
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

project_id = "<your-project-id>"
query = """ SELECT * from testSchema.testTable """

athletes = pandas_gbq.read_gbq(query=query, project_id=project_id, dialect='standard')

# Get a reference to the Spark Session
sc = SparkContext()
spark = SparkSession(sc)

# convert from Pandas to Spark
sparkDF = spark.createDataFrame(athletes)

# perform an operation on the DataFrame
print(sparkDF.count())
sparkDF.show()
Hope it helps someone! Keep pysparking :)

Pyspark And Cassandra - Extracting Data Into RDD as Fields from Map Field

I have a Cassandra table with a map field, and the data looks like this:

test_id    test_map
1          {tran_id=99, tran_type=sample}

I am attempting to add the map's entries to the existing RDD I am pulling this data into, as new fields on the exact same key, so that it looks like this:

test_id    test_map                          tran_id    tran_type
1          {tran_id=99, tran_type=sample}    99         sample

I'm able to pull the fields fine using the Spark context, but I can't find a good method to transform the map field into the new columns shown above.
Sample Code:
import os
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.0 --conf spark.cassandra.connection.host=xxx.xxx.xxx.xxx pyspark-shell'

sc = SparkContext("local", "test")
sqlContext = SQLContext(sc)

def test_df(keys_space_name, table_name):
    table_df = sqlContext.read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=table_name, keyspace=keys_space_name) \
        .load()
    return table_df

df_test = test_df("test", "test")

Then, to query the data, I use Spark SQL in this format:
df_test.registerTempTable("dftest")
df = sqlContext.sql(
    """
    select * from dftest
    """)

read.json not working in spark 2.1 as expected

I was using Spark 1.3 to read a JSON stream using jsonRDD. However, this no longer works with 2.1, where jsonRDD is deprecated; the updated method is read.json(). However, read.json does not seem to work and gives me this error:
u"cannot resolve '`availableDocks`' given input columns: [];
The code is given below:
ssc = StreamingContext(sc, 60)
streams = ssc.textFileStream('s3://realtime-nyc-bike')

def getSparkSessionInstance(sparkConf):
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]

def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())
        # Convert RDD[String] to RDD[Row] to DataFrame
        df = spark.read.json(rdd)
        # Creates a temporary view using the DataFrame
        df.createOrReplaceTempView("station_data")
        results = spark.sql("select stationName from station_data where availableDocks > 20")
        results.show()
The JSON is in a valid format and has been verified. Is there a way to specify columns for the JSON? This was working fine on 1.3 using jsonRDD. The JSON data can be obtained from https://feeds.citibikenyc.com/stations/stations.json, where I am using only the stationBeanList.
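A hedged sketch of a common workaround, assuming the error comes from schema inference running on an empty micro-batch (an empty RDD yields no columns, hence "given input columns: []"): skip empty RDDs and, if the columns must always exist, pass an explicit schema instead of relying on inference. The field names below are the ones used in the question and may need adjusting to the feed:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# explicit schema so the query never sees a column-less DataFrame
station_schema = StructType([
    StructField("stationName", StringType()),
    StructField("availableDocks", IntegerType()),
])

def process(time, rdd):
    if rdd.isEmpty():
        return  # nothing arrived in this batch; skip instead of inferring an empty schema
    spark = getSparkSessionInstance(rdd.context.getConf())
    df = spark.read.schema(station_schema).json(rdd)
    df.createOrReplaceTempView("station_data")
    spark.sql("select stationName from station_data where availableDocks > 20").show()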

Reading data from HDFS on a cluster

I am trying to read data from HDFS on an AWS EC2 cluster using a Jupyter Notebook. The cluster has 7 nodes. I am using HDP 2.4 and my code is below. The table has millions of rows, but the code does not return any rows. "ec2-xx-xxx-xxx-xx.compute-1.amazonaws.com" is the server (the Ambari server).
from pyspark.sql import SQLContext, HiveContext

sqlContext = HiveContext(sc)
demography = sqlContext.read.load("hdfs://ec2-xx-xx-xxx-xx.compute-1.amazonaws.com:8020/tmp/FAERS/demography_2012q4_2016q1_duplicates_removed.csv",
                                  format="com.databricks.spark.csv", header="true", inferSchema="true")
demography.printSchema()
demography.cache()
print demography.count()
But using sc.textFile, I get the correct number of rows:
data = sc.textFile("hdfs://ec2-xx-xxx-xxx-xx.compute-1.amazonaws.com:8020/tmp/FAERS/demography_2012q4_2016q1_duplicates_removed.csv")
schema = data.map(lambda x: x.split(",")).first()  # get schema
header = data.first()  # extract header
data = data.filter(lambda x: x != header)  # filter out header
data = data.map(lambda x: x.split(","))
data.count()
3641865
The answer by Indrajit given here solved my problem. The problem was with the spark-csv jar.
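For reference, a hedged sketch of how the spark-csv dependency is usually made available to PySpark; the package coordinates (Scala suffix and version) are illustrative and must match your cluster:

# launch PySpark/Jupyter with the spark-csv package on the classpath, e.g.
#   pyspark --packages com.databricks:spark-csv_2.10:1.5.0
# or set it before the SparkContext is created in the notebook:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.5.0 pyspark-shell'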
