I am trying to query a REST API and load the data into a DataFrame using PySpark, but it is throwing this error:
File "C:/Users/QueryRestapi.py", line 30, in <module>
df = parse_dataframe(json_data)
File "C:/Users/QueryRestapi.py", line 22, in parse_dataframe
rdd = SparkContext.parallelize(mylist)
TypeError: unbound method parallelize() must be called with SparkContext instance as first argument (got list instance instead)
Code:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
import json
spark = SparkSession \
    .builder \
    .appName("DataCleansing") \
    .getOrCreate()

def convert_single_object_per_line(json_list):
    json_string = ""
    for line in json_list:
        json_string += json.dumps(line) + "\n"
    return json_string

def parse_dataframe(json_data):
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = SparkContext.parallelize(mylist)
    df = sqlContext.jsonRDD(rdd)
    return df
url = "https://"mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Please help me if I am missing something. Thanks a lot.
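For reference, a likely fix for this particular error (a sketch, not a confirmed solution): parallelize is an instance method, so it has to be called on the SparkContext owned by the session rather than on the class itself.

rdd = spark.sparkContext.parallelize(mylist)   # instance method, not SparkContext.parallelize

Note that the next line, sqlContext.jsonRDD(rdd), will raise its own error on Spark 2.x; the follow-up question later in this thread hits exactly that, and a sketch of the replacement is given there.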
Related
I am facing an issue while converting a .csv file to a .json file using PySpark.
My code:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_pubs = spark.read.options(inferSchema='True',delimiter=',').csv("filename.csv")
#df_pubs_json = df_pubs.write.mode(SaveMode.Overwrite).json("filename.json")
#df_pubs_json = df_pubs.toJSON("filename.json")\
df_pubs.write.format("json").mode("overwrite").save("filename.json")
Error:
py4j.protocol.Py4JJavaError: An error occurred while calling o31.save.
: ExitCodeException exitCode=-1073741515:
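Exit code -1073741515 (0xC0000135) is a Windows "DLL not found" error, commonly traced to a missing or broken winutils.exe/hadoop.dll setup or its Visual C++ runtime dependency rather than to the PySpark code itself. The snippet below sidesteps that native write path: it converts the DataFrame to JSON and writes a single file through pandas on the driver, which only suits data that fits in driver memory.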
df = spark.read.options(header=True,inferSchema=True,delimiter=",").csv("filename.tsv")
df1 = df.toJSON()
df_final = spark.read.json(df1)
df_final.toPandas().to_json("filename.json",orient='records')
I am building a data pipeline which consumes data from a REST API in JSON format and pushes it to a Spark DataFrame (Spark version: 2.4.4), but I am getting this error:
df = SQLContext.jsonRDD(rdd)
AttributeError: type object 'SQLContext' has no attribute 'jsonRDD'
Code:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from urllib import urlopen
from pyspark import SQLContext
import json
spark = SparkSession \
    .builder \
    .appName("DataCleansing") \
    .getOrCreate()

def convert_single_object_per_line(json_list):
    json_string = ""
    for line in json_list:
        json_string += json.dumps(line) + "\n"
    return json_string

def parse_dataframe(json_data):
    r = convert_single_object_per_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = spark.sparkContext.parallelize(mylist)
    df = SQLContext.jsonRDD(rdd)
    return df
url = "https://mylink"
response = urlopen(url)
data = str(response.read())
json_data = json.loads(data)
df = parse_dataframe(json_data)
Is there any better way to query a REST API and bring the data into a Spark DataFrame using PySpark? I am not sure if I am missing something.
Check out the Spark REST API Data Source. One advantage of this library is that it uses multiple executors to fetch data from the REST API and create the DataFrame for you.
In your code, you are fetching all the data into the driver and creating the DataFrame there; that might fail with a heap-space error if the data is very large.
url = "https://mylink"
options = { 'url' : url, 'method' : 'GET', 'readTimeout' : '10000', 'connectionTimeout' : '2000', 'partitions' : '10'}
# Now we create the Dataframe which contains the result from the call to the API
df = spark.read.format("org.apache.dsext.spark.datasource.rest.RestDataSource").options(**options).load()
I am running Spark 2.4.4 with Python 2.7, and my IDE is PyCharm.
The input file contains encoded values in some columns, like the one below:
.ʽ|!3-2-704A------------ (the dashes stand for spaces)
I am trying to get a result like:
3-2-704A
I tried the code below:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Python Spark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.csv("Customers_v01.csv", header=True, sep=",")
myres = df.map(lambda x :x[1].decode('utf-8'))
print(myres.collect())
Error:
myres = df.map(lambda x :x[1].decode('utf-8'))
File "C:\spark\python\pyspark\sql\dataframe.py", line 1301, in __getattr__
"'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
AttributeError: 'DataFrame' object has no attribute 'map'
I am not sure what causes this error. Kindly help. Is there any other way to do it?
map is available on a Resilient Distributed Dataset (RDD), not on a DataFrame:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark").getOrCreate()
df = spark.read.csv("Customers_v01.csv", header=True, sep=",", encoding='utf-8')
myres = df.rdd.map(lambda x: x[1].encode().decode('utf-8'))
print(myres.collect())
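If the end goal is only to pull the code (e.g. 3-2-704A) out of the noisy value, a sketch that stays in the DataFrame API is regexp_extract; the output column name and the regex pattern below are assumptions about what the data looks like:

from pyspark.sql.functions import col, regexp_extract

# Assumes the encoded value sits in the second column and the wanted part
# looks like digits-digits-alphanumerics (e.g. 3-2-704A); adjust the pattern as needed.
second_col = df.columns[1]
myres = df.withColumn("code", regexp_extract(col(second_col), r"(\d+-\d+-[A-Za-z0-9]+)", 1))
myres.select("code").show(truncate=False)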
I am trying to read a group of Parquet files using PySpark with a custom schema, but it gives an AttributeError: 'StructField' object has no attribute '_get_object_id' error.
Here is my sample code:
import pyspark
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.sql.types import *
sc = pyspark.SparkContext()
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
l = [('1',31200,'Execute',140,'ABC'),('2',31201,'Execute',140,'ABC'),('3',31202,'Execute',142,'ABC'),
('4',31103,'Execute',149,'DEF'),('5',31204,'Execute',145,'DEF'),('6',31205,'Execute',149,'DEF')]
rdd = sc.parallelize(l)
trades = rdd.map(lambda x: Row(global_order_id=int(x[0]), nanos=int(x[1]),message_type=x[2], price=int(x[3]),symbol=x[4]))
trades_df = sqlContext.createDataFrame(trades)
trades_df.printSchema()
trades_df.write.parquet('trades_parquet')
trades_df_Parquet = sqlContext.read.parquet('trades_parquet')
trades_df_Parquet.printSchema()
# The schema is encoded in a string.
schemaString = "global_order_id message_type nanos price symbol"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
trades_df_Parquet_n = spark.read.format('parquet').load('trades_parquet',schema,inferSchema =False)
#trades_df_Parquet_n = spark.read.parquet('trades_parquet',schema)
trades_df_Parquet_n.printSchema()
Can anyone please help me with a suggestion?
Pass schema by name so load() does not treat it as the positional format argument:
Signature: spark.read.load(path=None, format=None, schema=None, **options)
You get:
trades_df_Parquet_n = spark.read.format('parquet').load('trades_parquet',schema=schema, inferSchema=False)
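An equivalent form, if you prefer it, is to attach the schema through the reader's schema() method, which avoids mixing positional and keyword arguments entirely:

# Equivalent: set the schema explicitly before loading
trades_df_Parquet_n = spark.read.schema(schema).parquet('trades_parquet')
trades_df_Parquet_n.printSchema()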
I am getting this error
Using Python version 3.5.2+ (default, Sep 22 2016 12:18:14)
SparkSession available as 'spark'.
Traceback (most recent call last):
File "/home/saria/PycharmProjects/TfidfLDA/main.py", line 30, in <module>
corpus = indexed_data.select(col("KeyIndex",str).cast("long"), "features").map(list)
File "/home/saria/tf27/lib/python3.5/site-packages/pyparsing.py", line 956, in col
return 1 if 0<loc<len(s) and s[loc-1] == '\n' else loc - s.rfind("\n", 0, loc)
TypeError: unorderable types: int() < str()
Process finished with exit code 1
It happens when I run the following code. I should explain that the error occurs on this line:
corpus = indexed_data.select(col("KeyIndex",str).cast("long"), "features").map(list)
I reviewed some similar questions, but they are about converting between int and string, especially when reading input; here I do not have any input to read.
Explanation of the code: it does TF-IDF + LDA using DataFrames.
# I used alias to avoid confusion with the mllib library
from pyparsing import col
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import HashingTF as MLHashingTF, Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.feature import IDF as MLIDF
from pyspark.python.pyspark.shell import sqlContext, sc
from pyspark.sql.types import DoubleType, StructField, StringType, StructType
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
dbURL = "hdfs://en.wikipedia.org/wiki/Music"
file = sc.textFile("1.txt")
#Define data frame schema
fields = [StructField('key',StringType(),False),StructField('content',StringType(),False)]
schema = StructType(fields)
#Data in format <key>,<listofwords>
file_temp = file.map(lambda l : l.split(","))
file_df = sqlContext.createDataFrame(file_temp, schema)
#Extract TF-IDF From https://spark.apache.org/docs/1.5.2/ml-features.html
tokenizer = Tokenizer(inputCol='content', outputCol='words')
wordsData = tokenizer.transform(file_df)
hashingTF = HashingTF(inputCol='words',outputCol='rawFeatures',numFeatures=1000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol='rawFeatures',outputCol='features')
idfModel = idf.fit(featurizedData)
rescaled_data = idfModel.transform(featurizedData)
indexer = StringIndexer(inputCol='key',outputCol='KeyIndex')
indexed_data = indexer.fit(rescaled_data).transform(rescaled_data).drop('key').drop('content').drop('words').drop('rawFeatures')
corpus = indexed_data.select(col("KeyIndex",str).cast("long"), "features").map(list)
model = LDA.train(corpus, k=2)
Please give me your ideas. When I delete the str in the error-prone line, i.e.
corpus = indexed_data.select(col("KeyIndex").cast("long"), "features").map(list)
it throws a new error:
TypeError: col() missing 1 required positional argument: 'strg'
Update
My main goal is to run this code: TF-IDF and then LDA.
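For what it's worth, the root cause looks like the import: col here comes from pyparsing (whose col(loc, strg) expects an integer location and a string), not from pyspark.sql.functions, and a DataFrame has no map(), so the pipeline has to drop down to the underlying RDD. Below is a sketch of the corrected corpus construction under those assumptions; mllib's LDA.train also expects old-style mllib vectors, so the ml vectors produced by HashingTF/IDF are converted with Vectors.fromML.

from pyspark.sql.functions import col                    # not pyparsing.col
from pyspark.mllib.clustering import LDA as MLlibLDA     # train() lives in mllib, not ml
from pyspark.mllib.linalg import Vectors as MLlibVectors

corpus = (indexed_data
          .select(col("KeyIndex").cast("long"), "features")
          .rdd                                            # map() is an RDD method
          .map(lambda row: [row[0], MLlibVectors.fromML(row[1])]))

model = MLlibLDA.train(corpus, k=2)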