I am doing twitter streaming data by kafka. I managed to stream the data and consume the twitter json. But now how do i create a pyspark dataframe containing the twitter data and the search keyword?
Below is how i write the kafka producer
I managed to create the dataframe of what data i want from the twitter object. But i don't know how to get the search keyword.
class StdOutListener(StreamListener):
def __init__(self, producer):
self.producer_obj = producer
#on_status is activated whenever a tweet has been heard
def on_data(self, data):
try:
self.producer_obj.send("twitterstreamingdata", data.encode('utf-8'))
print(data)
return True
except BaseException as e:
print("Error on_data: %s" % str(e))
return True
# When an error occurs
def on_error(self, status):
print (status)
return True
# When reach the rate limit
def on_limit(self, track):
# Print rate limiting error
print("Rate limited, continuing")
# Continue mining tweets
return True
# When timed out
def on_timeout(self):
# Print timeout message
print(sys.stderr, 'Timeout...')
# Wait 10 seconds
time.sleep(120)
return True # To continue listening
def on_disconnect(self, notice):
#Called when twitter sends a disconnect notice
return
if __name__ == '__main__':
spark = SparkSession \
.builder \
.appName("Kafka Producer Application") \
.getOrCreate()
#This is the initialization of Kafka producer
producer = KafkaProducer(bootstrap_servers='xx.xxx.xxx.xxx:9092')
#This handles twitter auth and the conn to twitter streaming API
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener(producer))
print("Kafka Producer Application: ")
WORDS = input("Enter any words: ")
print ("Is this what you just said?", WORDS)
word = [u for u in WORDS.split(',')]
#This line filter twitter stream to capture data by keywords
stream.filter(track=word)
One way to resolve your problem it's changing StdOutListener class constructor to receive "keyword" parameter and add "keyword" to JSON in "on_data" function to send to Kafka
import json
import sys
import time
from kafka import KafkaProducer
from pyspark.sql import SparkSession
from tweepy import StreamListener, Stream, OAuthHandler
class StdOutListener(StreamListener):
def __init__(self, producer: KafkaProducer = None, keyword=None):
super(StreamListener, self).__init__()
self.producer = producer
self.keyword = keyword
# on_status is activated whenever a tweet has been heard
def on_data(self, data):
try:
data = json.loads(data)
data['keyword'] = self.keyword
data = json.dumps(data)
self.producer.send("twitterstreamingdata", data.encode('utf-8'))
return True
except BaseException as e:
print("Error on_data: %s" % str(e))
return True
# When an error occurs
def on_error(self, status):
print(status)
return True
# When reach the rate limit
def on_limit(self, track):
# Print rate limiting error
print("Rate limited, continuing")
# Continue mining tweets
return True
# When timed out
def on_timeout(self):
# Print timeout message
print(sys.stderr, 'Timeout...')
# Wait 10 seconds
time.sleep(120)
return True # To continue listening
def on_disconnect(self, notice):
# Called when twitter sends a disconnect notice
return
if __name__ == '__main__':
CONSUMER_KEY = 'YOUR CONSUMER KEY'
CONSUMER_SECRET = 'YOUR CONSUMER SECRET'
ACCESS_TOKEN = 'YOUR ACCESS TOKEN'
ACCESS_SECRET = 'YOUR ACCESS SECRET'
print("Kafka Producer Application: ")
words = input("Enter any words: ")
print("Is this what you just said?", words)
word = [u for u in words.split(',')]
spark = SparkSession \
.builder \
.appName("Kafka Producer Application") \
.getOrCreate()
# This is the initialization of Kafka producer
kafka_producer = KafkaProducer(bootstrap_servers='35.240.157.219:9092')
# This handles twitter auth and the conn to twitter streaming API
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
stream = Stream(auth, StdOutListener(producer=kafka_producer, keyword=word))
stream.filter(track=word)
Hope it helps you!
Related
I have trouble using Flask socketio. Here is the code I use:
import json
import time
from flask import Flask, render_template
from flask_socketio import SocketIO, emit
from engineio.async_drivers import gevent
from flask_cors import CORS
from gevent.pywsgi import WSGIServer
from geventwebsocket.handler import WebSocketHandler
app = Flask(__name__)
app.config['SECRET_KEY'] = 'secret!'
socketio = SocketIO(app, cors_allowed_origins="*")
import queue
queue_notification_thread = queue.Queue()
def callback_notification(data):
print("callback device {}".format(data))
notification_thread = threading.Thread(target=notification_job, args=(data))
notification_thread.start()
queue_notification_thread.put(notification_thread)
def notification_job(data):
print("callback device in notification job {}".format(data))
socketio.emit("notification", data, broadcast=True)
#socketio.on('request')
def handle_message(data):
Logger.instance().debug('received message: {}'.format(data))
try:
if data.__contains__('data'):
response_message = dict()
response_message['Devices'] = dict()
response_message['Devices']['event'] = 'MY_EVENT'
socketio.emit('notification', response_message, broadcast=True)
else:
Logger.instance().error('Can\'t parse data {}'.format(data))
except OSError as err:
print('Error: when process {} \n ValueError {}'.format(data, err))
#socketio.on_error_default # handles all namespaces without an explicit error handler
def default_error_handler(e):
print('An error occured:')
print(e)
if __name__ == '__main__':
serialReader = SerialReader()
serialReader.start_reading(callback_notification)
http_server = WSGIServer(('', 5000), app, handler_class=WebSocketHandler)
http_server.serve_forever()
And the reader with call asynchronisly:
class SerialController:
serial_port: str
serial_device: serial.Serial
reading_thread: threading.Thread
device_name: str
def __init__(self, serial_port: str = "/dev/ttyACM0", baudrate=115200, read_timeout=0.2, device_name=''):
self.serial_port = serial_port
self.device_name = device_name
self.serial_device = serial.Serial(port=self.serial_port, baudrate=baudrate, timeout=0.2)
def start_reading(self, callback_function):
self.reading_callback = callback_function
# run the thread to
self.reading_thread = threading.Thread(target=self.read_job)
self.reading_thread.start()
def read_job(self):
available_data = 0
while True:
if self.serial_device.in_waiting > available_data:
available_data = self.serial_device.in_waiting
print('available_data {}'.format(available_data))
time.sleep(0.1)
else:
if available_data != 0:
data = self.serial_device.readall()
available_data = 0
if data != b'' and data != b'\n':
if self.reading_callback != None:
message = dict()
message["Reader"] = dict()
message["Reader"]["device"] = self.device_name
message["Reader"]["data"] = data
self.reading_callback(message)
time.sleep(1)
When I receive a message in #socketio.on('request') the bradcast emission work properly with no delay. When I use callback_notification called from my serial reader the breadcast emission have variable delay ( from 1seconde to 10 secondes).
On my server the message "callback device ..." is printed instantly but the client receive the message after few second.
I tried to the emission call in a thread like in the shown code but there is no improvment
I want to publish messages to a Pub/Sub topic with some attributes thanks to Dataflow Job in batch mode.
My dataflow pipeline is write with python 3.8 and apache-beam 2.27.0
It works with the #Ankur solution here : https://stackoverflow.com/a/55824287/9455637
But I think it could be more efficient with a shared Pub/Sub Client : https://stackoverflow.com/a/55833997/9455637
However an error occurred:
return StockUnpickler.find_class(self, module, name) AttributeError:
Can't get attribute 'PublishFn' on <module 'dataflow_worker.start'
from
'/usr/local/lib/python3.8/site-packages/dataflow_worker/start.py'>
Questions:
Would the shared publisher implementation improve beam pipeline performance?
Is there another way to avoid pickling error on my shared publisher client ?
My Dataflow Pipeline :
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from google.cloud.pubsub_v1 import PublisherClient
import json
import argparse
import re
import logging
class PubsubClient(PublisherClient):
def __reduce__(self):
return self.__class__, (self.batch_settings,)
# The DoFn to perform on each element in the input PCollection.
class PublishFn(beam.DoFn):
def __init__(self):
from google.cloud import pubsub_v1
batch_settings = pubsub_v1.types.BatchSettings(
max_bytes=1024, # One kilobyte
max_latency=1, # One second
)
self.publisher = PubsubClient(batch_settings)
super().__init__()
def process(self, element, **kwargs):
future = self.publisher.publish(
topic=element["topic"],
data=json.dumps(element["data"]).encode("utf-8"),
**element["attributes"],
)
return future.result()
def run(argv=None, save_main_session=True):
"""Main entry point; defines and runs the pipeline."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--source_table_id",
dest="source_table_id",
default="",
help="BigQuery source table <project>.<dataset>.<table> with columns (topic, attributes, data)",
)
known_args, pipeline_args = parser.parse_known_args(argv)
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
# pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
bq_source_table = known_args.source_table_id
bq_table_regex = r"^(?P<PROJECT_ID>[a-zA-Z0-9_-]*)[\.|\:](?P<DATASET_ID>[a-zA-Z0-9_]*)\.(?P<TABLE_ID>[a-zA-Z0-9_-]*)$"
regex_match = re.search(bq_table_regex, bq_source_table)
if not regex_match:
raise ValueError(
f"Bad BigQuery table id : `{bq_source_table}` please match {bq_table_regex}"
)
table_ref = bigquery.TableReference(
projectId=regex_match.group("PROJECT_ID"),
datasetId=regex_match.group("DATASET_ID"),
tableId=regex_match.group("TABLE_ID"),
)
with beam.Pipeline(options=pipeline_options) as p:
(
p
| "ReadFromBqTable" #
>> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True) # Each row contains : topic / attributes / data
| "PublishRowsToPubSub" >> beam.ParDo(PublishFn())
)
if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)
run()
After fussing with this a bit, I think I have an answer that works consistently and is, if not world-beatingly performant, at least tolerably usable:
import logging
import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage
from google.cloud.pubsub_v1 import PublisherClient
from google.cloud.pubsub_v1.types import (
BatchSettings,
LimitExceededBehavior,
PublishFlowControl,
PublisherOptions,
)
class PublishClient(PublisherClient):
"""
You have to override __reduce__ to make PublisherClient pickleable 😡 😤 🤬
Props to 'Ankur' and 'Benjamin' on SO for figuring this part out; god knows
I would not have...
"""
def __reduce__(self):
return self.__class__, (self.batch_settings, self.publisher_options)
class PubsubWriter(beam.DoFn):
"""
beam.io.gcp.pubsub does not yet support batch operations, so
we do this the hard way. it's not as performant as the native
pubsubio but it does the job.
"""
def __init__(self, topic: str):
self.topic = topic
self.window = beam.window.GlobalWindow()
self.count = 0
def setup(self):
batch_settings = BatchSettings(
max_bytes=1e6, # 1MB
# by default it is 10 ms, should be less than timeout used in future.result() to avoid timeout
max_latency=1,
)
publisher_options = PublisherOptions(
enable_message_ordering=False,
# better to be slow than to drop messages during a recovery...
flow_control=PublishFlowControl(limit_exceeded_behavior=LimitExceededBehavior.BLOCK),
)
self.publisher = PublishClient(batch_settings, publisher_options)
def start_bundle(self):
self.futures = []
def process(self, element: PubsubMessage, window=beam.DoFn.WindowParam):
self.window = window
self.futures.append(
self.publisher.publish(
topic=self.topic,
data=element.data,
**element.attributes,
)
)
def finish_bundle(self):
"""Iterate over the list of async publish results and block
until all of them have either succeeded or timed out. Yield
a WindowedValue of the success/fail counts."""
results = []
self.count = self.count + len(self.futures)
for fut in self.futures:
try:
# future.result() blocks until success or timeout;
# we've set a max_latency of 60s upstairs in BatchSettings,
# so we should never spend much time waiting here.
results.append(fut.result(timeout=60))
except Exception as ex:
results.append(ex)
res_count = {"success": 0}
for res in results:
if isinstance(res, str):
res_count["success"] += 1
else:
# if it's not a string, it's an exception
msg = str(res)
if msg not in res_count:
res_count[msg] = 1
else:
res_count[msg] += 1
logging.info(f"Pubsub publish results: {res_count}")
yield beam.utils.windowed_value.WindowedValue(
value=res_count,
timestamp=0,
windows=[self.window],
)
def teardown(self):
logging.info(f"Published {self.count} messages")
The trick is that if you call future.result() inside the process() method, you will block until that single message is successfully published, so instead collect a list of futures and then at the end of the bundle make sure they're all either published or definitively timed out. Some quick testing with one of our internal pipelines suggested that this approach can publish 1.6M messages in ~200s.
I learnt that Twitter has stopped providing JSON for deleted tweets.I am trying to get past this limitation by using a polling method to see if tweet is deleted.
But my code still fails. I would appreciate it if you can help me figure out what I am missing.
import sys
import json
import tweepy
from tweepy import Stream
from tweepy.streaming import StreamListener
import datetime
import time
from polling import TimeoutException, poll
# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you after
consumer_key = 'xx'
consumer_secret = 'xx'
access_token = 'xx'
access_token_secret = 'xx'
# Set up the authorisation to use the Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
# Handle the output generated from the stream
class listener(StreamListener):
tweetcount = 0
def on_data(self, data):
# Convert the message to JSON
json_data = json.loads(data)
# text_file = open(json_data['id_str'] + ".json", "w")
# text_file.write(data)
# text_file.close()
if 'id_str' not in json_data:
# If this isn't a status, do nothing.
print("no ID")
else:
#print("Twitter Id ",json_data['id_str'])
#print("User Id ",json_data['user']['id_str'])
if json_data['user']['id_str'] == '51241574': #Associated Press
tweetcount = json_data['user']['statuses_count']
tweet = api.get_status(json_data['id'])
print("Tweet Count ",tweetcount)
print("Account Name ", json_data['user']['name'])
print(tweet.text)
else:
pass
# if 'delete' in json_data:
# print ("DELETED!")
# if json_data['delete']['status']['user_id'] == '51241574':
# deleted_tweet_id =json_data['delete']['status']['id']
# tweetcount -= 1
# print("New Count is ",tweetcount)
# print(deleted_tweet_id)
# deleted_tweet =api.get_status(deleted_tweet_id)
# print(deleted_tweet.text)
#
# else:
# pass
return True
def on_error(self, status):
print("Error status is ",status)
# Start consuming from the stream. This will get all the Tweets & Deletions from the users the user is following.
twitterStream = Stream(auth, listener())
twitterStream.filter(follow=['51241574'], async=True)
# polling method to check if tweet is deleted
try:
user = api.get_user('AP')
poll(lambda: user.statuses_count >= listener.tweetcount > 0, timeout=30, step=1)
print("Tweet was deleted,New Tweet count is ", user.statuses_count)
except Exception as ex:
template = "An exception of type {0} occurred. Arguments:\n{1!r}"
message = template.format(type(ex).__name__, ex.args)
print (message)
When a listener event is fired, the application shows the value in tweet count variable and checks it against the value retrieved from querying the api.
I am running a code that gets tweets from the Twitter API and saving them to a txt file. the code is as follows:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sentmod as s
#consumer key, consumer secret, access token, access secret.
ckey= "xxxxxxxxxxxxxxxxxxx"
csecret="xxxxxxxxxxxxxxxxxxx"
atoken="xxxxxxxxxxxxxxxxxxx"
asecret="xxxxxxxxxxxxxxxxxxx"
class listener(StreamListener):
def on_data(self, data):
all_data = json.loads(data)
tweet = all_data["text"]
sentiment_value, confidence = s.sentiment(tweet)
tweet.encode('ascii', 'ignore')
tweets= open("tweets.txt","a",encoding="utf-8")
tweets.write(tweet)
tweets.write('\n\n\n')
tweets.close()
print(tweet, sentiment_value, confidence)
if confidence*100 >= 60:
output = open("twitter-out.txt","a")
output.write(sentiment_value)
output.write('\n\n\n')
output.close()
return True
def on_error(self, status):
print(status)
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["Car"],languages=['en']) #locations=[]
When I add the tweets to a text file the tweets sometimes get repeated tweets, how can I fix that?
I am newly using RabbitMq for pyspark communication from remote pc through RPC.For testing purpose i have developed a test code which is giving me the error
I have followed RabbitMq doc tutorial for implementing RPC over pyspark
Here is my spark RPC server code
import pika
from tkinter import*
from pyspark.sql import SparkSession
from pyspark import SparkConf,SparkContext
import json
import re
connectionparam=pika.ConnectionParameters(host="localhost")
connection=pika.BlockingConnection(connectionparam)
channel=connection.channel()
channel.queue_declare(queue='rpc_queue')
spark=SparkSession.builder.config("spark.sql.warehouse.dir", "C:\spark\spark-warehouse")\
\
.appName("TestApp").\
enableHiveSupport().getOrCreate()
print("success")
#establishhing chraracter
#sqlstring="SELECT lflow1.LeaseType as LeaseType, lflow1.Status as Status, lflow1.Property as property, lflow1.City as City, lesflow2.DealType as DealType, lesflow2.Area as Area, lflow1.Did as DID, lesflow2.MID as MID from lflow1, lesflow2 WHERE lflow1.Did = lesflow2.MID"
def queryBuilder(sqlval):
print("printing",sqlval)
df=spark.sql(sqlval)
print("printing data frame table")
df.show()
resultlist = df.toJSON().collect()
dumpdata = re.sub(r"\'", "", str(resultlist))
jsondata = json.dumps(dumpdata)
#print(jsondata)
return jsondata
def on_request(ch,method,props, body):
n=body
print("printing request body ",n)
response=queryBuilder(n)
ch.basic_publish(exchange='',
routing_key=props.reply_to,
properties=pika.BasicProperties(correlation_id=props.correlation_id),
body=response
)
ch.basic_ack(delivery_tag=method.delivery_tag)
channel.basic_qos(prefetch_count=1)
channel.basic_consume(on_request,queue='rpc_queue')
print("[x] Awaiting RPC Request")
channel.start_consuming()
master=Tk()
entryval=Entry(master)
entryval.grid(row=0,column=1)
Button(master,text='Quit',command=master.quit).grid(row=3,column=1,sticky=W,pady=50)
mainloop()
and my following RPC client code for remote pyspark application is
import pika
import uuid
class SparkRpcClient(object):
def __init__(self):
self.connection = pika.BlockingConnection(pika.ConnectionParameters(
host='localhost'))
self.channel = self.connection.channel()
result = self.channel.queue_declare(exclusive=True)
self.callback_queue = result.method.queue
self.channel.basic_consume(self.on_response, no_ack=True,
queue=self.callback_queue)
def on_response(self, ch, method, props, body):
if self.corr_id == props.correlation_id:
self.response = body
def call(self, querymsg):
self.response = None
self.corr_id = str(uuid.uuid4())
self.channel.basic_publish(exchange='',
routing_key='rpc_queue',
properties=pika.BasicProperties(
reply_to = self.callback_queue,
correlation_id = self.corr_id,
),
body=querymsg)
while self.response is None:
self.connection.process_data_events()
return int(self.response)
sparkrpc = SparkRpcClient()
sqlstring="SELECT lflow1.LeaseType as LeaseType, lflow1.Status as Status, lflow1.Property as property, lflow1.City as City, lesflow2.DealType as DealType, lesflow2.Area as Area, lflow1.Did as DID, lesflow2.MID as MID from lflow1, lesflow2 WHERE lflow1.Did = lesflow2.MID"
print(" [x] Requesting query")
response = sparkrpc.call(sqlstring)
print(" [.] Got %s" % response)
My server has already received the request string from client and print it but it could not works on my querybuild() function which process the sqlstring and return json data. More over i have requested multiple times and it seems thats individual request has queued in rpc queue but not cleared out.Because if i run only server script i am getting same error. May be i am missing something here can anyone help me to figure it out. i just want to return json data to client
Thanks in advance
Kalyan
You're passing incompatible type (looks like either bytes or bytearray) where str is expected.
You should decode the content to string first.
def queryBuilder(sqlval, enc):
...
df = spark.sql(sqlval.decode(enc))
df.show()
...