Python: Facebook Graph API - batch request - python-3.x

I want to make a batch request that fetches the campaigns for a specific ad account. I wrote some simple code based on this issue,
but it relies on global arrays, and I don't know whether time.sleep(2) is necessary. My code is below:
from facebookads import FacebookAdsApi
from facebookads.api import FacebookRequest
import pandas as pd
import time
# Module-level accumulator that success_callback appends to.
batch_body_responses = []
# Input list handed to get_id_list; one request is queued per element.
list_of_artists = [1]
def success_callback(response):
    """Store the payload of one successful batch response.

    Appends ``[data]`` to the module-level ``batch_body_responses`` list and,
    when the body carries a ``paging.next`` URL, appends ``[next_url]`` right
    after it.

    Args:
        response: Object whose ``json()`` method returns the decoded body.

    A body without a ``data`` key is skipped. A body without ``paging.next``
    (for example the last page of a result set) still has its data recorded
    rather than failing with ``KeyError`` and losing the page.
    """
    try:
        body = response.json()  # decode once and reuse for both fields
    except UnicodeEncodeError:
        # Best effort: a body that cannot be decoded is skipped.
        return
    if not isinstance(body, dict) or 'data' not in body:
        return
    batch_body_responses.append([body['data']])
    next_url = (body.get('paging') or {}).get('next')
    if next_url is not None:
        batch_body_responses.append([next_url])
def error_callback(response):
    """Failure hook for batch requests; the response is deliberately ignored."""
    return None
def generate_batches(iterable, batch_size_limit):
    """Yield consecutive lists of at most ``batch_size_limit`` items.

    Args:
        iterable: Any iterable; it is consumed lazily, one item at a time.
        batch_size_limit: Maximum number of items per yielded list (>= 1).

    Yields:
        Lists of items in input order; only the last one may be shorter.

    Raises:
        ValueError: If ``batch_size_limit`` is smaller than 1 (raised when
            iteration starts, because this is a generator).
    """
    # This function can be found in examples/batch_utils.py
    if batch_size_limit < 1:
        # A non-positive limit would otherwise yield an empty batch followed
        # by a single batch holding every item.
        raise ValueError("batch_size_limit must be at least 1")
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size_limit:
            yield batch
            batch = []
    if batch:
        yield batch
def get_id_list(art_search_list):
# Queue one campaigns request per item of art_search_list, in batches of at
# most 25, execute the batches, then print and return batch_body_responses.
batches = []
your_app_id = '756885'
your_app_secret = '123456789'
your_access_token = 'EAA.....'
api = FacebookAdsApi.init(your_app_id, your_app_secret, your_access_token)
batch_limit = 25
for batch in generate_batches(art_search_list, batch_limit):
next_batch = api.new_batch()
for artt in batch:
# The loop item itself is not used: every request targets the same node.
requestss = [FacebookRequest(node_id='act_1234/campaigns',method="GET",endpoint="?fields=id,name")]
for req in requestss:
next_batch.add_request(req, success_callback, error_callback)
batches.append(next_batch)
for batch_request in batches:
batch_request.execute()
# NOTE(review): execute() presumably returns only after the callbacks have run, which would make this pause unnecessary - confirm against the SDK.
time.sleep(2)
print(batch_body_responses)
return batch_body_responses
# Run the batch fetch at import time and wrap the collected list in a DataFrame.
df = pd.DataFrame(get_id_list(list_of_artists))
How can this code be improved so that it does not use global arrays? Can it run without the sleep statement, and why would the sleep be needed in the first place?

Related

Unexpected Multithreading Output when Web Scraping with Selenium (Python)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time
# Current time is :48.77885s per Page, 4.4344 per Region
# Headless Chrome configuration used by the driver created below.
options = Options()
options.add_argument("--headless")
# Single module-level driver instance; gather_summoner_info() calls driver.get() on it.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
def list_to_csv(summoner_info):
    """Write the distinct summoner rows to ``high_elo_summoners.csv``.

    Each row contributes its first three fields as one comma-separated line.
    Duplicate rows are written once; line order follows set iteration order.
    """
    unique_rows = {tuple(row) for row in summoner_info}
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            f"{row[0]},{row[1]},{row[2]}\n" for row in unique_rows
        )
def gather_summoner_info(url):
    """Load *url* in the module-level ``driver`` and parse its ``rt-tr`` rows.

    Returns a list with one ``[text_line_1, text_line_2, number]`` entry per
    matched element except the first, where ``number`` is the leading token of
    the fourth text line with thousands separators removed.
    """
    driver.get(url)
    driver.implicitly_wait(5)  # Wait until the CSS Selector is available
    rows = driver.find_elements(By.CLASS_NAME, 'rt-tr')
    summoner_info = []
    for row in rows[1:]:  # the first matched element is skipped
        fields = row.text.split('\n')
        points = int(fields[3].split(' ')[0].replace(',', ''))
        summoner_info.append([fields[1], fields[2], points])
    return summoner_info
def get_summoner_data(page_count, regions):
    """Scrape every (page, region) leaderboard URL on a thread pool and save the rows.

    One ``gather_summoner_info`` task is submitted per URL (up to 20 workers);
    the results are concatenated in submission order and handed to
    ``list_to_csv``.
    """
    links = []
    for page in range(page_count):
        for region in regions:
            links.append(f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}')
    # Gather all the relevant summoner information on the page
    pending = {}
    with ThreadPoolExecutor(max_workers=20) as executor:
        for link in links:
            pending[link] = executor.submit(gather_summoner_info, link)
    agg_summoner_info = []
    for future in pending.values():
        agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    """Scrape the first leaderboard page of every configured region."""
    regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
    page_count = 1
    get_summoner_data(page_count, regions)
if __name__ == '__main__':
# Time the whole run and print the elapsed seconds.
s = time.perf_counter()
main()
e = time.perf_counter()
print(e - s)
Issue: Code is returning the same output for each iteration (The first link of the links list)
Above the following code pulls some information from the links variable using selenium. The issue is that when the threads are executing in the get_summoner_data() function, it is returning the same results every time. I'm not sure what the issue is coming from as the different links will print from each gather_summoner_info() call.
Currently it is just returning the information from the very first link. Not sure what is causing the issue, any help is appreciated.
Approach
Try running without --headless option. You will see what's going on.
Problem
You created only one web driver instance, and that single instance is used by all the threaded tasks. Multiple threads try to load different URLs on this one driver, so it is very likely that every task ends up reading whichever URL was loaded last.
Fix
Simple fix is to create a driver instance for every thread.
You can do this by moving the line creating a web driver into the thread task function gather_summoner_info as below. I tried with this fix and it works correctly.
def gather_summoner_info(url):
    """Scrape one leaderboard page with a driver owned by this call.

    A fresh Chrome driver is created per call so that concurrent threads never
    share a browser, and it is always quit afterwards so that no browser
    process is left running.

    Args:
        url: Leaderboard page to load.

    Returns:
        A list with one ``[text_line_1, text_line_2, number]`` entry per
        ``rt-tr`` element except the first.
    """
    ##### moved ######
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    ##################
    try:
        driver.get(url)
        driver.implicitly_wait(5)  # Wait until the CSS Selector is available
        summoner_info = []
        content = driver.find_elements(By.CLASS_NAME, "rt-tr")
        for index, con in enumerate(content):
            if index == 0:
                continue  # the first matched element is skipped
            summoner = con.text.split("\n")
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        return summoner_info
    finally:
        # Release the browser even when loading or parsing fails.
        driver.quit()
Further Consideration
As you know, creating a new web driver instance is resource expensive. If you are just trying to scrape information, HTTP requests are enough most of the times.
For the website you are trying to scrape, I found that the job can be done using HTTP requests only. I revised the script without using Selenium and it takes less than 1 second to load all the leaderboards for all regions.
import json
import time
from concurrent.futures import ThreadPoolExecutor
import requests
def list_to_csv(summoner_info):
    """Write the rows to ``result.csv``, highest third field first.

    Args:
        summoner_info: Iterable of ``(name, tier, lp)`` rows. ``lp`` may be an
            int or a numeric string.

    Rows are sorted by ``int(row[2])`` in descending order and written as
    comma-joined lines separated by newlines, with no trailing newline.
    """
    ordered = sorted(summoner_info, key=lambda row: int(row[2]), reverse=True)
    with open("result.csv", "w", encoding="utf-8") as f:
        # str() each field: str.join raises TypeError on non-string items such
        # as a numeric lp.
        f.write("\n".join(",".join(map(str, row)) for row in ordered))
def gather_summoner_info(region: str):
    """Fetch page 1 of the ranked leaderboard for *region* from the u.gg API.

    Args:
        region: Region id sent as the ``regionId`` GraphQL variable.

    Returns:
        A list of ``(summonerName, tier, lp)`` tuples, one per player.

    Raises:
        requests.RequestException: On connection problems, on a timeout, or
            when the server answers with an HTTP error status.
    """
    payload = json.dumps(
        {
            "operationName": "getRankedLeaderboard",
            "variables": {"page": 1, "queueType": 420, "regionId": region},
            "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
        }
    )
    headers = {"Content-Type": "application/json"}
    # Without a timeout a stalled connection would block its worker thread forever.
    response = requests.post("https://u.gg/api", headers=headers, data=payload, timeout=30)
    # Fail on HTTP errors here instead of with a confusing KeyError below.
    response.raise_for_status()
    data = response.json()
    return [
        (player["summonerName"], player["tier"], player["lp"])
        for player in data["data"]["leaderboardPage"]["players"]
    ]
def get_summoner_data(page_count, regions):
    """Fetch the leaderboard of every region concurrently and save the rows.

    ``page_count`` is accepted but not used by this implementation. One
    ``gather_summoner_info`` task is submitted per region (up to 20 workers);
    the results are concatenated in submission order and passed to
    ``list_to_csv``.
    """
    pending = {}
    with ThreadPoolExecutor(max_workers=20) as executor:
        for region in regions:
            pending[region] = executor.submit(gather_summoner_info, region)
    agg_summoner_info = []
    for future in pending.values():
        agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    """Collect the leaderboard of every configured region."""
    regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
    page_count = 1
    get_summoner_data(page_count, regions)
if __name__ == "__main__":
# Time the whole run and print the elapsed seconds.
s = time.perf_counter()
main()
e = time.perf_counter()
print(e - s)

Why serial code is faster than concurrent.futures in this case?

I am using the following code to process some pictures for my ML project and I would like to parallelize it.
import multiprocessing as mp
import concurrent.futures
def track_ids(seq):
'''The func is so big I can not put it here'''
# Result mapping: one entry per element of seq.
ood = {}
for i in seq:
# I load around 500 images and process them
# NOTE: `some Value` is a placeholder for the real computation, not valid Python.
ood[i] = some Value
return ood
# Build the list of sequence names.
# NOTE(review): `txt` is not defined in the code shown - confirm where it comes from.
seqs = []
for seq in range(1, 10):# len(seqs)+1):
seq = txt+str(seq)
seqs.append(seq)
# serial call of the function
track_ids(seq)
#parallel call of the function
# The pool is sized to mp.cpu_count() workers; track_ids is mapped over seqs.
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
ood_id = ex.map(track_ids, seqs)
If I run the code serially it takes 3.0 minutes, but in parallel with concurrent.futures it takes 3.5 minutes.
Can someone please explain why that is, and suggest a way to solve the problem?
btw, I have 12 cores.
Thanks
Here's a brief example of how one might go about profiling multiprocessing code vs serial execution:
from multiprocessing import Pool
from cProfile import Profile
from pstats import Stats
import concurrent.futures
def track_ids(seq):
'''The func is so big I can not put it here'''
# Result mapping: one entry per element of seq.
ood = {}
for i in seq:
# I load around 500 images and process them
# NOTE: `some Value` is a placeholder for the real computation, not valid Python.
ood[i] = some Value
return ood
def profile_seq():
# Profile the serial variant with one profiler; returns (Stats, seqs).
p = Profile() #one and only profiler instance
p.enable()
seqs = []
# NOTE(review): `txt` is not defined in the code shown - confirm where it comes from.
for seq in range(1, 10):# len(seqs)+1):
seq = txt+str(seq)
seqs.append(seq)
# serial call of the function
track_ids(seq)
p.disable()
return Stats(p), seqs
def track_ids_pr(seq):
    """Run ``track_ids(seq)`` under its own profiler.

    Returns a ``(stats, result)`` tuple. The ``Stats`` object is built with a
    placeholder ``stream`` value ("dummy") instead of the default stream.
    """
    profiler = Profile()  # profile the child tasks
    profiler.enable()
    result = track_ids(seq)
    profiler.disable()
    stats = Stats(profiler, stream="dummy")
    return (stats, result)
def profile_parallel():
# Profile the parent process and merge in the per-task Stats returned by
# track_ids_pr; returns (combined Stats, list of task results).
# NOTE(review): `mp` and `seqs` are not defined in the code shown - confirm where they come from.
p = Profile() #profile stuff in the main process
p.enable()
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
retvals = ex.map(track_ids_pr, seqs)
p.disable()
s = Stats(p)
out = []
for ret in retvals:
# ret is the (Stats, result) pair produced by track_ids_pr.
s.add(ret[0])
out.append(ret[1])
return s, out
if __name__ == "__main__":
# Run the parallel profile and print the merged statistics.
stat, retval = profile_parallel()
stat.print_stats()
EDIT: Unfortunately I found out that pstats.Stats objects cannot be used normally with multiprocessing.Queue because they are not picklable (which is needed for the operation of concurrent.futures). Evidently a Stats object normally stores a reference to a file for the purpose of writing statistics to that file, and if none is given, it grabs a reference to sys.stdout by default. We don't actually need that reference until we want to print out the statistics, so we can give it a temporary value to prevent the pickle error and then restore an appropriate value later. The following example should be copy-paste-able and run just fine, unlike the pseudocode-ish example above.
from multiprocessing import Queue, Process
from cProfile import Profile
from pstats import Stats
import sys
def isprime(x):
    """Return True when *x* is a prime number, otherwise False.

    Args:
        x: Integer to test. Values below 2 are never prime.

    Uses trial division by every candidate from 2 up to and including
    ``floor(sqrt(x))``.
    """
    if x < 2:
        return False
    # The upper bound must include the square root itself, otherwise squares
    # of primes (4, 9, 25, ...) have no divisor tested and look prime.
    for d in range(2, int(x ** 0.5) + 1):
        if x % d == 0:
            return False
    return True
def foo(retq):
    """Profile a prime search below 2**20 and put the resulting Stats on *retq*.

    The ``Stats`` object is created with a placeholder ``stream`` so that it
    can be pickled; the receiver is expected to set a real stream before
    printing.
    """
    profiler = Profile()
    profiler.enable()
    upper_bound = 2**20
    found = []
    for candidate in range(3, upper_bound):
        if isprime(candidate):
            found.append(candidate)
    profiler.disable()
    stats = Stats(profiler, stream="dummy")  # Dirty hack: set `stream` to something picklable then override later
    retq.put(stats)
if __name__ == "__main__":
# Start two child processes that each put one Stats object on the shared queue.
q = Queue()
p1 = Process(target=foo, args=(q,))
p1.start()
p2 = Process(target=foo, args=(q,))
p2.start()
# Both results are taken from the queue before the processes are joined.
s1 = q.get()
s1.stream = sys.stdout #restore original file
s2 = q.get()
# s2.stream #if we are just adding this `Stats` object to another the `stream` just gets thrown away anyway.
s1.add(s2) #add up the stats from both child processes.
s1.print_stats() #s1.stream gets used here, but not before. If you provide a file to write to instead of sys.stdout, it will write to that file)
p1.join()
p2.join()

How to publish to Pub/Sub from Dataflow in batch (efficiently)?

I want to publish messages with some attributes to a Pub/Sub topic from a Dataflow job in batch mode.
My Dataflow pipeline is written with Python 3.8 and apache-beam 2.27.0.
It works with the #Ankur solution here : https://stackoverflow.com/a/55824287/9455637
But I think it could be more efficient with a shared Pub/Sub Client : https://stackoverflow.com/a/55833997/9455637
However an error occurred:
return StockUnpickler.find_class(self, module, name) AttributeError:
Can't get attribute 'PublishFn' on <module 'dataflow_worker.start'
from
'/usr/local/lib/python3.8/site-packages/dataflow_worker/start.py'>
Questions:
Would the shared publisher implementation improve beam pipeline performance?
Is there another way to avoid pickling error on my shared publisher client ?
My Dataflow Pipeline :
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from google.cloud.pubsub_v1 import PublisherClient
import json
import argparse
import re
import logging
class PubsubClient(PublisherClient):
    """PublisherClient subclass that defines its own pickling recipe."""

    def __reduce__(self):
        """Return ``(class, (batch_settings,))`` so unpickling rebuilds the client."""
        constructor_args = (self.batch_settings,)
        return self.__class__, constructor_args
# The DoFn to perform on each element in the input PCollection.
class PublishFn(beam.DoFn):
    """DoFn that publishes each element to Pub/Sub and waits for the result.

    The publisher client is created in the constructor, so it is part of the
    DoFn instance's state.
    """

    def __init__(self):
        from google.cloud import pubsub_v1

        settings = pubsub_v1.types.BatchSettings(
            max_bytes=1024,  # One kilobyte
            max_latency=1,  # One second
        )
        self.publisher = PubsubClient(settings)
        super().__init__()

    def process(self, element, **kwargs):
        """Publish one element and return the value of ``future.result()``.

        ``element`` must provide the keys ``topic``, ``data`` (JSON-encoded
        and sent as UTF-8 bytes) and ``attributes`` (expanded as keyword
        arguments of ``publish``).
        """
        topic = element["topic"]
        payload = json.dumps(element["data"]).encode("utf-8")
        attributes = element["attributes"]
        publish_future = self.publisher.publish(topic=topic, data=payload, **attributes)
        # Blocks until this single message has been handled.
        return publish_future.result()
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline.

    Args:
        argv: Command-line arguments. ``--source_table_id`` is consumed here;
            everything else is forwarded to ``PipelineOptions``.
        save_main_session: Accepted for compatibility. It is currently not
            applied because the line that would set it is commented out.

    Raises:
        ValueError: If ``--source_table_id`` does not look like
            ``<project>.<dataset>.<table>`` (``:`` is also accepted after the
            project).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source_table_id",
        dest="source_table_id",
        default="",
        help="BigQuery source table <project>.<dataset>.<table> with columns (topic, attributes, data)",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    bq_source_table = known_args.source_table_id
    # The project/dataset separator is "." or ":". Inside a character class
    # "|" is a literal, so the class must not contain it.
    bq_table_regex = r"^(?P<PROJECT_ID>[a-zA-Z0-9_-]*)[.:](?P<DATASET_ID>[a-zA-Z0-9_]*)\.(?P<TABLE_ID>[a-zA-Z0-9_-]*)$"
    regex_match = re.search(bq_table_regex, bq_source_table)
    if not regex_match:
        raise ValueError(
            f"Bad BigQuery table id : `{bq_source_table}` please match {bq_table_regex}"
        )
    table_ref = bigquery.TableReference(
        projectId=regex_match.group("PROJECT_ID"),
        datasetId=regex_match.group("DATASET_ID"),
        tableId=regex_match.group("TABLE_ID"),
    )
    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromBqTable"
            >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)  # Each row contains : topic / attributes / data
            | "PublishRowsToPubSub" >> beam.ParDo(PublishFn())
        )
if __name__ == "__main__":
# Show INFO-level log records, then run the pipeline with default arguments.
logging.getLogger().setLevel(logging.INFO)
run()
After fussing with this a bit, I think I have an answer that works consistently and is, if not world-beatingly performant, at least tolerably usable:
import logging
import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage
from google.cloud.pubsub_v1 import PublisherClient
from google.cloud.pubsub_v1.types import (
BatchSettings,
LimitExceededBehavior,
PublishFlowControl,
PublisherOptions,
)
class PublishClient(PublisherClient):
    """
    PublisherClient made picklable by overriding __reduce__.
    Credit to 'Ankur' and 'Benjamin' on SO for working this part out.
    """

    def __reduce__(self):
        # Rebuild the client from its batch settings and publisher options.
        constructor_args = (self.batch_settings, self.publisher_options)
        return self.__class__, constructor_args
class PubsubWriter(beam.DoFn):
    """
    beam.io.gcp.pubsub does not yet support batch operations, so
    we do this the hard way. it's not as performant as the native
    pubsubio but it does the job.

    Messages are handed to the publisher in process() without waiting;
    finish_bundle() then waits for every pending publish of the bundle and
    emits one dict of outcome counts.
    """

    def __init__(self, topic: str):
        """Remember the destination *topic*; the client is created in setup()."""
        super().__init__()
        self.topic = topic
        self.window = beam.window.GlobalWindow()
        self.count = 0

    def setup(self):
        """Create the publisher client for this DoFn instance."""
        batch_settings = BatchSettings(
            max_bytes=1_000_000,  # 1MB, as an integer byte count
            # Flush a batch after at most 1 second. This must stay below the
            # timeout passed to future.result() in finish_bundle().
            max_latency=1,
        )
        publisher_options = PublisherOptions(
            enable_message_ordering=False,
            # better to be slow than to drop messages during a recovery...
            flow_control=PublishFlowControl(limit_exceeded_behavior=LimitExceededBehavior.BLOCK),
        )
        self.publisher = PublishClient(batch_settings, publisher_options)

    def start_bundle(self):
        """Start each bundle with an empty list of pending publish futures."""
        self.futures = []

    def process(self, element: PubsubMessage, window=beam.DoFn.WindowParam):
        """Queue one message for publishing without waiting for the result."""
        # Remember the most recent window; finish_bundle() emits into it.
        self.window = window
        self.futures.append(
            self.publisher.publish(
                topic=self.topic,
                data=element.data,
                **element.attributes,
            )
        )

    def finish_bundle(self):
        """Iterate over the list of async publish results and block
        until all of them have either succeeded or timed out. Yield
        a WindowedValue of the success/fail counts."""
        self.count += len(self.futures)
        res_count = {"success": 0}
        for fut in self.futures:
            try:
                # future.result() blocks until success or timeout. Batches
                # are flushed after max_latency (1 second, see setup()), so
                # the 60 second timeout should rarely be reached.
                res = fut.result(timeout=60)
            except Exception as ex:
                res = ex
            if isinstance(res, str):
                res_count["success"] += 1
            else:
                # if it's not a string, it's an exception
                msg = str(res)
                res_count[msg] = res_count.get(msg, 0) + 1
        logging.info(f"Pubsub publish results: {res_count}")
        yield beam.utils.windowed_value.WindowedValue(
            value=res_count,
            timestamp=0,
            windows=[self.window],
        )

    def teardown(self):
        """Log how many messages this instance handed to the publisher."""
        logging.info(f"Published {self.count} messages")
The trick is that if you call future.result() inside the process() method, you will block until that single message is successfully published, so instead collect a list of futures and then at the end of the bundle make sure they're all either published or definitively timed out. Some quick testing with one of our internal pipelines suggested that this approach can publish 1.6M messages in ~200s.

Python OneLogin: How to get all events using get_events?

I'm using Python 3 to get information about past events through the OneLogin API. I use the onelogin-python-sdk, which I got from GitHub.
I can get events with get_events. However, only the number of data items specified in max_results can be retrieved. What should I do to get data that can reach tens of thousands?
Should I use another API?
import json
import csv
from optparse import OptionParser
from onelogin.api.client import OneLoginClient
# Query filters passed to client.get_events() in main(); empty here.
query_parameters = {}
def get_options():
# Parse the command-line options (body omitted in this excerpt).
# NOTE: `options` is not defined in the code shown.
return options
def format_eventdata(event_param):
# Format the event data (body omitted in this excerpt).
# NOTE: `event_data` is not defined in the code shown.
return(event_data)
def main():
# Fetch events with the OneLogin client and write one CSV row per event to options.file.
options = get_options()
client = OneLoginClient(options.client_id, options.client_secret, 'US')
events = client.get_events(query_parameters)
with open(options.file, 'w', newline='') as f:
writer = csv.writer(f)
idx = 0
if events:
for data in events:
# idx is advanced once per iteration, so events[idx] is presumably the same item as `data` - confirm events is a list.
eventdata=events[idx]
csv_data = format_eventdata(eventdata)
writer.writerow(csv_data)
idx += 1
else:
print("end")
if __name__ == '__main__':
# Script entry point.
main()
I found a solution by myself.
Adjusted the query parameters to reduce the amount of data retrieved at one time.
Thank you.

EOF error when using Pool

In my code, I am trying to use multiprocessing to find the max price of each coin given a URL. There are around 1400 coins that I have to get data for, so I implemented Python's multiprocessing Pool. I'm not sure if I am using it correctly, but I followed the example given from this website: https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
Here is my code:
import requests
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
# NOTE(review): nothing in the code shown reads or writes this list.
max_prices = []
def find_max(url):
    """Return the highest price in the ``price_usd`` series served at *url*.

    Args:
        url: Endpoint returning JSON with a ``price_usd`` list whose entries
            hold the price at index 1.

    Raises:
        requests.RequestException: On connection problems, on a timeout, or
            when the server answers with an HTTP error status.
        ValueError: If the ``price_usd`` series is empty.
    """
    # finds maximum price of a coin
    # A timeout keeps one stalled request from blocking a pool worker forever.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    cont = r.json()
    return max(point[1] for point in cont["price_usd"])
# Load the coin list and derive one price-graph URL per coin slug.
with open("coins.txt", "r") as f:
    data = json.load(f)
coin_slug = [entry["slug"] for entry in data]
coin_names = [entry["name"] for entry in data]
urls = ["https://graphs2.coinmarketcap.com/currencies/" + slug + "/" for slug in coin_slug]
if __name__ == '__main__':
# Map find_max over all URLs with a pool of 5 worker processes.
with Pool(5) as p:
# NOTE: the closing parenthesis of print( is missing on the next line; this is what triggers the reported EOF error.
print(p.map(find_max, urls)
When I added this part of the code, it gave me an EOF error:
if __name__ == '__main__':
with Pool(5) as p:
print(p.map(find_max, urls)
You have unbalanced brackets in the last line. It should be
print(p.map(find_max, urls)).

Resources