Spark streaming Twitter API

Spark streaming Twitter API - apache-spark

I start the following .py which create socket and wait for a connection:
def get_tweets():
url = 'https://stream.twitter.com/1.1/statuses/filter.json'
query_data = [('language', 'en'), ('locations', '-130,-20,100,50'),('track','#')]
query_url = url + '?' + '&'.join([str(t[0]) + '=' + str(t[1]) for t in query_data])
response = requests.get(query_url, auth=my_auth, stream=True)
print(query_url, response)
return response
def send_tweets_to_spark(http_resp, tcp_connection):
for line in http_resp.iter_lines():
try:
full_tweet = json.loads(line)
tweet_text = full_tweet['text']
print("Tweet Text: " + tweet_text)
print ("------------------------------------------")
tcp_connection.send(tweet_text + '\n')
except:
e = sys.exc_info()[0]
print("Error: %s" % e)
TCP_IP = '127.0.0.1'
TCP_PORT = 9009
conn = None
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((TCP_IP, TCP_PORT))
s.listen(1)
print("Waiting for TCP connection...")
conn, addr = s.accept()
print("Connected... Starting getting tweets.")
resp = get_tweets()
print(resp, conn)
send_tweets_to_spark(resp, conn)
Then I run spark-submit the following Spark streaming script which should count the tweet every 2 seconds:
def aggregate_tags_count(new_values, total_sum):
return sum(new_values) + (total_sum or 0)
def get_sql_context_instance(spark_context):
if ('sqlContextSingletonInstance' not in globals()):
globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
return globals()['sqlContextSingletonInstance']
def process_rdd(time, rdd):
print("----------- %s -----------" % str(time))
sql_context = get_sql_context_instance(rdd.context)
row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
hashtags_df = sql_context.createDataFrame(row_rdd)
hashtags_df.registerTempTable("hashtags")
hashtag_counts_df = sql_context.sql("select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10")
hashtag_counts_df.show()
send_df_to_dashboard(hashtag_counts_df)
conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 2)
ssc.checkpoint("checkpoint_TwitterApp")
dataStream = ssc.socketTextStream("127.0.0.1",9009)
words = dataStream.flatMap(lambda line: line.split(" "))
hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1))
tags_totals = hashtags.updateStateByKey(aggregate_tags_count)
tags_totals.foreachRDD(process_rdd)
ssc.start()
ssc.awaitTermination()
This starts the apps as I can see it in the WEB UI running. My Problem is when I run the spark apps, it hit the first script which send the Tweet but It output empty RDD. The error is as below:
20/12/30 08:53:56 INFO StandaloneAppClient$ClientEndpoint: Executor updated: app-20201230085356-0012/0 is now RUNNING
20/12/30 08:53:57 INFO StandaloneSchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
----------- 2020-12-30 08:54:20 -----------
20/12/30 08:54:24 ERROR JobScheduler: Error running job streaming job 1609318460000 ms.0
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/streaming/util.py", line 68, in call
r = self.func(t, *rdds)
File "/home/ubuntu/market_risk/utils/spark_twitter_count.py", line 26, in process_rdd
hashtags_df = sql_context.createDataFrame(row_rdd)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 320, in createDataFrame
return self.sparkSession.createDataFrame(data, schema, samplingRatio, verifySchema)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 605, in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 628, in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 425, in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 396, in _inferSchema
first = rdd.first()
File "/opt/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1467, in first
raise ValueError("RDD is empty")
ValueError: RDD is empty

Related

An HTTP Client raised an unhandled exception: 'int' object is not callable

I'm facing the issue in two python libraries used in my code, Getting these randomly on prod on some of the pods and not all pods
Trace of The Error
Traceback (most recent call last):\n File \"/usr/local/lib/python3.10/site-packages/botocore/httpsession.py\",
line 448, in send\n
urllib_response = conn.urlopen(\n File \"/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py\",
line 703, in urlopen\n
httplib_response = self._make_request(\n File \"/usr/local/lib/python3.10/site-packages/newrelic/hooks/external_urllib3.py\",
line 32, in _nr_wrapper_make_request_\n
return wrapped(*args, **kwargs)\n
File \"/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py\",
line 386, in _make_request\n self._validate_conn(conn)\n
File \"/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py\",
line 1040, in _validate_conn\n
conn.connect()\n File \"/usr/local/lib/python3.10/site-packages/urllib3/connection.py\",
line 416, in connect\n
self.sock = ssl_wrap_socket(\n File \"/usr/local/lib/python3.10/site-packages/urllib3/util/ssl_.py\",
line 424, in ssl_wrap_socket\n context.set_alpn_protocols(ALPN_PROTOCOLS)\n
File \"/usr/local/lib/python3.10/ssl.py\",
line 566, in set_alpn_protocols\n
if len(b) == 0 or len(b) > 255:\nTypeError: 'int' object is not callable\n\nDuring handling of the above exception, another exception occurred:\n\nTraceback (most recent call last):\n
File \"/code/app/aws_utils/dynamo_db.py\", line 176, in update_multi_attributes\n
response = table.update_item(\n File \"/usr/local/lib/python3.10/site-packages/boto3/resources/factory.py\",
line 580, in do_action\n
response = action(self, *args, **kwargs)\n
File \"/usr/local/lib/python3.10/site-packages/boto3/resources/action.py\",
line 88, in __call__\n response = getattr(parent.meta.client, operation_name)(*args, **params)\n
File \"/usr/local/lib/python3.10/site-packages/botocore/client.py\",
line 514, in _api_call\n
return self._make_api_call(operation_name, kwargs)\n
File \"/usr/local/lib/python3.10/site-packages/botocore/client.py\",
line 921, in _make_api_call\n
http, parsed_response = self._make_request(\n
File \"/usr/local/lib/python3.10/site-packages/botocore/client.py\",
line 944, in _make_request\n
return self._endpoint.make_request(operation_model, request_dict)\n
File \"/usr/local/lib/python3.10/site-packages/newrelic/hooks/external_botocore.py\",
line 108, in _nr_endpoint_make_request_\n
result = wrapped(*args, **kwargs)\n
File \"/usr/local/lib/python3.10/site-packages/botocore/endpoint.py\",
line 119, in make_request\n
return self._send_request(request_dict, operation_model)\n
File \"/usr/local/lib/python3.10/site-packages/botocore/endpoint.py\",
line 231, in _send_request\n
raise exception\n File \"/usr/local/lib/python3.10/site-packages/botocore/endpoint.py\",
line 281, in _do_get_response\n
http_response = self._send(request)\n
File \"/usr/local/lib/python3.10/site-packages/botocore/endpoint.py\",
line 377, in _send\n return self.http_session.send(request)\n
File \"/usr/local/lib/python3.10/site-packages/botocore/httpsession.py\",
line 493, in send\n
raise HTTPClientError(error=e)\nbotocore.exceptions.HTTPClientError: An HTTP Client raised an unhandled exception: 'int' object is not callable\n",
Getting this in all dynamodb calls , adding one of the codes here, exception comes in the packages and not in the code as far as i understood
def update_multi_attributes(
self,
table,
partition_key,
sort_key,
attr_value_dict,
delete_attr=[],
):
try:
if not partition_key or not sort_key:
return None
if not attr_value_dict and len(delete_attr) == 0:
return None
if not attr_value_dict and len(delete_attr) > 0:
return self.delete_attributes(
table=table,
partition_key=partition_key,
sort_key=sort_key,
delete_attr=delete_attr,
)
expression_attribute_values = {}
expression_attribute_names = {}
update_expression = "SET "
remove_attr_expression = "REMOVE "
index = 0
attr_value_dict.pop("partition_key", None)
attr_value_dict.pop("sort_key", None)
for key, value in attr_value_dict.items():
index += 1
var_name = "#var" + str(index)
var_value_name = ":var" + str(index)
expression_attribute_values[var_value_name] = value
expression_attribute_names[var_name] = key
if index < len(attr_value_dict.keys()):
update_expression = (
update_expression + var_name + "=" + var_value_name + ","
)
else:
update_expression = (
update_expression + var_name + "=" + var_value_name
)
index = 0
for attr in delete_attr:
index += 1
var_name = "#rvar" + str(index)
expression_attribute_names[var_name] = attr
if index < len(delete_attr):
remove_attr_expression = remove_attr_expression + var_name + ","
else:
remove_attr_expression = remove_attr_expression + var_name
if len(delete_attr) > 0:
update_expression = update_expression + " " + remove_attr_expression
response = table.update_item(
Key=self.get_key(partition_key, sort_key),
ExpressionAttributeNames=expression_attribute_names,
UpdateExpression=update_expression,
ConditionExpression="attribute_exists(partition_key)"
"and attribute_exists(sort_key)",
ExpressionAttributeValues=expression_attribute_values,
)
return response
except botocore.exceptions.ClientError as e:
logger.info(
"Dynamo: Error Update MultiAttribute boto3{} error{},{}for{},{}".format(
attr_value_dict,
e,
e.response["Error"],
partition_key,
traceback.format_exc(),
)
)
except Exception as e:
logger.info(
"Dynamo: Error Update MultiAttribute {} error {} for {}, {}".format(
attr_value_dict, e, partition_key, traceback.format_exc()
)
)
return None
Adding Package versions used
urllib3-1.26.8
botocore-1.27.77
boto3-1.24.77
Had put a trace print, thinking of upgrading the urllib version but not sure if that would solve the problem, Suddenly this error started coming up, code was working fine for a month after integration.`

How to fix error after creating exe with Pyinsaller?

After creating the file exe, the following error is displayed.
This problem may be related to the TCP/IP protocol.
I don't quite understand what the mistake is.
Traceback (most recent call last):
File "list_queue.py", line 56, in <module>
File "list_queue.py", line 17, in lenth_queue
File "pymqi\__init__.py", line 3024, in connect
File "pymqi\__init__.py", line 1649, in connect_tcp_client
File "pymqi\__init__.py", line 1624, in connect_with_options
pymqi.MQMIError: MQI Error. Comp: 2, Reason 2012: FAILED: MQRC_ENVIRONMENT_ERROR
Although everything works in PyCharm, all the data I enter works and the script works fine.
MyCode:
def lenth_queue():
dict_queue = collections.defaultdict(dict)
queue_manager = input('Enter the name of the queue manager: ')
channel = input('Enter the name of the communication channel: ')
host = input('Enter a name for the IP address of the queue manager: ')
port = input('Enter the name of the queue manager port: ')
conn_info = '%s(%s)' % (host, port)
queue_type = pymqi.CMQC.MQQT_LOCAL
qmgr = pymqi.connect(queue_manager, channel, conn_info)
c = 0
try:
prefix = '*'
pcf = pymqi.PCFExecute(qmgr,response_wait_interval=600000)
attrs = [] # typeList[pymqi.MQOpts]
attrs.append(pymqi.CFST(Parameter=pymqi.CMQC.MQCA_Q_NAME,
String=pymqi.ensure_bytes(prefix)))
attrs.append(pymqi.CFIN(Parameter=pymqi.CMQC.MQIA_Q_TYPE,
Value=queue_type))
attrs.append(pymqi.CFIL(Parameter=pymqi.CMQCFC.MQIACF_Q_ATTRS,
Values=[pymqi.CMQC.MQIA_CURRENT_Q_DEPTH]))
object_filters = []
object_filters.append(
pymqi.CFIF(Parameter=pymqi.CMQC.MQIA_CURRENT_Q_DEPTH,
Operator=pymqi.CMQCFC.MQCFOP_GREATER,
FilterValue=0))
response = pcf.MQCMD_INQUIRE_Q(attrs, object_filters)
for queue_info in response:
queue_name = queue_info[pymqi.CMQC.MQCA_Q_NAME]
queue_depth = queue_info[pymqi.CMQC.MQIA_CURRENT_Q_DEPTH]
dict_queue[queue_name.strip().decode()] = queue_depth
c += 1
writer_queue('Queue_lenth',dict_queue)
return 'File written successfully'
except pymqi.MQMIError as e:
return 'Failed to connect'
def writer_queue(name,dict_q):
txt = io.open(name + ".txt", "w", encoding="utf-8")
for key in dict_q:
txt.write('{}: {} message(s)'.format(key, dict_q[key]) + '\n')
txt.close()
print(lenth_queue())
input('Press ENTER to exit')

Shard ID %s heartbeat blocked for more than 10s seconds. Issue

Hi I'm getting an issue where I get a pretty long error returning Shard ID %s heartbeat blocked for more than 10s seconds.
I originally thought that I could be happening due to the database returning any information because it's empty. I tried to prevent the error by checking for None and returning it but the error still persists.
> Message: 'Shard ID %s heartbeat blocked for more than %s seconds.\nLoop thread traceback (most recent
call last):\n File "/app/run.py", line 270, in <module>\n bot.run(token)\n File
"/app/.jack/python/lib/python3.9/site-packages/discord/client.py", line 713, in run\n
loop.run_forever()\n File "/app/.jack/python/lib/python3.9/asyncio/base_events.py", line 596, in
run_forever\n self._run_once()\n File "/app/.jack/python/lib/python3.9/asyncio/base_events.py",
line 1890, in _run_once\n handle._run()\n File
"/app/.jack/python/lib/python3.9/asyncio/events.py", line 80, in _run\n
self._context.run(self._callback, *self._args)\n File "/app/.jack/python/lib/python3.9/site-
packages/discord/client.py", line 343, in _run_event\n await coro(*args, **kwargs)\n File
"/app/cogs/mods.py", line 339, in mute_check\n cursor.execute("SELECT username FROM blacklist
WHERE username=%s", (user.id, ))\n
> : Arguments: (None, 10)
Here is what I'm working with:
#tasks.loop(seconds=60)
async def mute_check(self):
await self.bot.wait_until_ready()
self.ready = True
guild = self.bot.get_guild(ID)
for member in guild.members:
conn = psycopg2.connect(DATABASE_URL, sslmode='require')
cursor = conn.cursor()
# Changed the query so that NULLs (which will be cast as None) are not returned
cursor.execute("SELECT time FROM blacklist WHERE username=%s AND time IS NOT NULL", (member.id, ))
# no index needed, multiple stamps are returned
results = cursor.fetchall()
for result in results:
if result is None:
return
# first and only returned element
timestamp = result[0]
restricted_role = get(guild.roles, name="Restricted")
datestamp = datetime.now()
datetimestring = str(datestamp.now().strftime("%Y%m%d%H%M%S"))
dateonlystring = timestamp.strftime("%Y%m%d%H%M%S")
if (datetimestring > dateonlystring):
await member.remove_roles(restricted_role, reason='Restricted role removed (auto timer).')
cursor.close()
conn.close()
Help would be greatly appreciated.

Airflow task_id not found

I am trying to set unique id for each of the celery task in airflow. but for some reason it throws me the following error. I even tried with uuid module and the same error pops up.
Traceback (most recent call last):
File "/home/mahesh/anaconda3/bin/airflow", line 37, in <module>
args.func(args)
File "/home/mahesh/anaconda3/lib/python3.7/site-packages/airflow/utils/cli.py", line 76, in wrapper
return f(*args, **kwargs)
File "/home/mahesh/anaconda3/lib/python3.7/site-packages/airflow/bin/cli.py", line 547, in run
task = dag.get_task(task_id=args.task_id)
File "/home/mahesh/anaconda3/lib/python3.7/site-packages/airflow/models/dag.py", line 1263, in get_task
raise TaskNotFound("Task {task_id} not found".format(task_id=task_id))
airflow.exceptions.TaskNotFound: Task 893370 not found
Following is my DAG
dag_id = 'test'
dag = DAG(dag_id, description='test DAG',
schedule_interval=None, start_date=datetime(2018, 11, 1), catchup=False)
def my_sleeping_function(**context):
url = context['url']
r = requests.head(url)
return {url : r.status_code}
def fetch_final_result(**context):
task_instance = context['task_instance']
ans = []
print("paypal is ", uuid_list)
for i in uuid_list:
data = task_instance.xcom_pull(task_ids=i)
ans.append(data)
print("uber is ", ans)
return ans
run_this_bash_last = PythonOperator(
task_id= 'last',
python_callable=fetch_final_result,
# op_kwargs={'url': 'asd'},
dag=dag,
provide_context=True)
urls = [website1, website2, website3, website4, website5, website6]
for i in urls:
index += 1
ind_id = str(random.randint(1, 100000000000000))
uuid_list.append(ind_id)
task_python = PythonOperator(
# task_id = ind_id,
task_id = ind_id,
python_callable=my_sleeping_function,
op_kwargs={'url': i},
dag=dag,
provide_context=True)
task_python.set_downstream(run_this_bash_last)
Please suggest how to overcome this

IMAP4LIB When using the store command I get the error "BAD [b'Could not parse command']"

I am new to all of this so I'm sorry if I mess this up or have already made a mess. I have two classes a GUI and my MailSorter class in the GUI class I have method which logins, then one that fetches all the EmailIds then finally fetches all the From emails and stores it in a dict. which stores the From email and amount of times it appears and an array with the From email and the ID.
def fetchFrom(self,emailIDs):
EmailAmount = dict()
Email = []
count = 0
for emailId in emailIDs:
#Converts email into string
result2,email_data = self.mail.fetch(emailId,'(RFC822)')
try:
raw_email = email_data[0][1].decode("utf-8")
email_message = email.message_from_string(raw_email)
#Fetches email address sent from
From = email_message["From"]
Email.append((From,emailId))
#print(From)
if From in EmailAmount:
EmailAmount[From] = EmailAmount[From] + 1
else:
EmailAmount[From] = 1
count += 1
if count > 10:
break
except Exception as e:
self.log.append((emailId,e))
def mainScreenInterface(self):
#Process
print("Loading program")
EmailIds = self.Mail.fetchEmailId()
EmailDict, self.EmailArray = self.Mail.fetchFrom(EmailIds)
self.master.geometry("750x600")
self.master.title("Main Screen")
self.destoryWidget()
#New Frame
self.mainScreen = tk.Frame(self.master)
self.mainScreen.pack()
#Labels
mainText = tk.Label(self.mainScreen,text = "All Emails")
mainText.config(font=("Courier","25"))
#Buttons
delete = tk.Button(self.mainScreen,text="Delete", command = self.Delete)
deleteAll = tk.Button(self.mainScreen,text="Delete All", command = self.DeleteAll)
Help = tk.Button(self.mainScreen,text="Help", command = self.Help_)
#Scrollbar
scrollbar = tk.Scrollbar(root)
scrollbar.pack(side="right",fill="y")
#Listbox
self.listbox = tk.Listbox(root,width = root.winfo_screenwidth(), height = 25)
#Attach a scrool wheel to the listbox
self.listbox.config(yscrollcommand=scrollbar.set)
scrollbar.config(command=self.listbox.yview)
#Add items to the list box
count = 1
for x,y in EmailDict.items():
self.listbox.insert(count,(x,y))
count += 1
#Placement
paddingValue = 40
mainText.pack(side="top")
self.listbox.pack(side="top")
delete.pack(side="left",padx=paddingValue)
deleteAll.pack(side="left",padx=paddingValue)
Help.pack(side="left",padx=paddingValue)
def Delete(self):
emailName = self.listbox.get(tk.ANCHOR)[0]
self.Mail.deleteEmail(emailName,self.EmailArray)
So the fetchFrom is from the mailSorter class and the other two are the GUI class, when I call the deleteEmail I get the error:
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Python\lib\tkinter\__init__.py", line 1705, in __call__
return self.func(*args)
File "C:\Users\******\Desktop\Email Sorter v3.py", line 197, in Delete
self.Mail.deleteEmail(emailName,self.EmailArray)
File "C:\Users\******\Desktop\Email Sorter v3.py", line 66, in deleteEmail
self.mail.store(Id[1].strip(), '+X-GM-tk.LabelS', '\\Trash')
File "C:\Python\lib\imaplib.py", line 840, in store
typ, dat = self._simple_command('STORE', message_set, command, flags)
File "C:\Python\lib\imaplib.py", line 1196, in _simple_command
return self._command_complete(name, self._command(name, *args))
File "C:\Python\lib\imaplib.py", line 1027, in _command_complete
raise self.error('%s command error: %s %s' % (name, typ, data))
imaplib.IMAP4.error: STORE command error: BAD [b'Could not parse command']
but when I run it as a text base with no GUI and use an example email it all works fine:
test = MailSorter("hamadnassor5#gmail.com","snerfulbubble1.")
test.login()
EmailIds = test.fetchEmailId()
EmailDict, EmailArray = test.fetchFrom(EmailIds)
test.displayEmails(EmailDict)
test.deleteEmail("Xbox <Xbox#outlook.com>",EmailArray)
test.closeCon()

DeleteMail code
def deleteEmail(self, emailName, EmailArray):
for Id in EmailArray:
if Id[0] == emailName:
print(Id[0])
print(emailName)
print(Id[1])
self.mail.store(Id[1].strip(), '+X-GM-tk.LabelS', '\\Trash')

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Spark streaming Twitter API - apache-spark

Related

An HTTP Client raised an unhandled exception: 'int' object is not callable

How to fix error after creating exe with Pyinsaller?

Shard ID %s heartbeat blocked for more than 10s seconds. Issue

Airflow task_id not found

IMAP4LIB When using the store command I get the error "BAD [b'Could not parse command']"

Categories

Resources