Python OneLogin: How to get all events using get_events? - python-3.x

I'm using Python 3 to get information about past events through the OneLogin API. I use the onelogin-python-sdk, which I got from GitHub.
I can get events with get_events. However, it only returns as many items as max_results specifies. How can I retrieve all of the events when there may be tens of thousands of them?
Should I use another API?
import json
import csv
from optparse import OptionParser
from onelogin.api.client import OneLoginClient
query_parameters = {}
def get_options():
# Parse the options (implementation omitted in this excerpt; presumably uses OptionParser)
return options
def format_eventdata(event_param):
# Format the event data into the row handed to csv.writer (implementation omitted in this excerpt)
return(event_data)
def main():
    """Fetch events from OneLogin and write one CSV row per event.

    Reads ``client_id``, ``client_secret`` and ``file`` from the parsed
    options, requests events with the module-level ``query_parameters``, and
    writes each formatted event to the output file.  Prints ``"end"`` when
    the API call returns nothing.
    """
    options = get_options()
    client = OneLoginClient(options.client_id, options.client_secret, 'US')
    events = client.get_events(query_parameters)
    with open(options.file, 'w', newline='') as f:
        writer = csv.writer(f)
        if events:
            # Iterate the events directly instead of indexing with a
            # hand-maintained counter; this also works for non-indexable
            # iterables.
            for event in events:
                writer.writerow(format_eventdata(event))
        else:
            print("end")


if __name__ == '__main__':
    main()

I found a solution by myself.
Adjusted the query parameters to reduce the amount of data retrieved at one time.
Thank you.

Related

Unexpected Multithreading Output when Web Scraping with Selenium (Python)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time
# Current time is :48.77885s per Page, 4.4344 per Region
# Headless Chrome configuration. This single module-level driver is the one
# gather_summoner_info() uses for every URL it is given.
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
def list_to_csv(summoner_info):
    """Write the unique summoner rows to ``high_elo_summoners.csv``.

    Each row is converted to a tuple so duplicates can be dropped with a set.
    The set is unordered, so rows are written in arbitrary order.  Only the
    first three fields of each row are written, comma-separated, one row per
    line.
    """
    # Set comprehension: no intermediate list is built just to be discarded.
    unique_rows = {tuple(summoner) for summoner in summoner_info}
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as f:
        for summoner in unique_rows:
            f.write(f"{summoner[0]},{summoner[1]},{summoner[2]}\n")
def gather_summoner_info(url):
    """Load *url* in the module-level driver and scrape the leaderboard rows.

    Every element with class ``rt-tr`` except the first is split into lines;
    lines 1 and 2 are kept as-is and the leading number of line 3 (thousands
    separators removed) is converted to ``int``.

    Returns a list of ``[field1, field2, number]`` lists.
    """
    driver.get(url)
    driver.implicitly_wait(5)  # implicit wait of 5 s for element lookups
    rows = driver.find_elements(By.CLASS_NAME, 'rt-tr')

    summoner_info = []
    # The first matched element is skipped, exactly as the index check did.
    for row in rows[1:]:
        fields = row.text.split('\n')
        points = int(fields[3].split(' ')[0].replace(',', ''))
        summoner_info.append([fields[1], fields[2], points])
    return summoner_info
def get_summoner_data(page_count, regions):
    """Scrape every (page, region) leaderboard URL and write the rows to CSV.

    One URL is built per page (1-based) and region, each is submitted to a
    pool of up to 20 threads, and the combined results are passed to
    ``list_to_csv``.
    """
    links = []
    for page in range(page_count):
        for region in regions:
            links.append(f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}')

    # Gather all the relevant summoner information on each page
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        pending = {}
        for url in links:
            pending[url] = executor.submit(gather_summoner_info, url)
        for future in pending.values():
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    """Scrape the first leaderboard page of every listed region."""
    get_summoner_data(
        1,
        ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2'],
    )


if __name__ == '__main__':
    # Time the whole run and print the elapsed seconds.
    started = time.perf_counter()
    main()
    finished = time.perf_counter()
    print(finished - started)
Issue: Code is returning the same output for each iteration (The first link of the links list)
The code above uses Selenium to pull some information from each URL in the links variable. The issue is that when the threads execute in the get_summoner_data() function, every call returns the same results. I'm not sure where the issue is coming from, because each gather_summoner_info() call prints a different link.
Currently it is just returning the information from the very first link. Not sure what is causing the issue, any help is appreciated.
Approach
Try running without --headless option. You will see what's going on.
Problem
You created only one web driver instance, and that single instance is used by all of the threaded tasks. Multiple threads try to load different URLs on this one driver, so every task most likely ends up reading whichever page happened to be loaded last.
Fix
Simple fix is to create a driver instance for every thread.
You can do this by moving the line creating a web driver into the thread task function gather_summoner_info as below. I tried with this fix and it works correctly.
def gather_summoner_info(url):
    """Scrape the leaderboard rows at *url* using a driver private to this call.

    A new Chrome driver is created for every call so concurrent threads never
    share a browser, and it is always quit afterwards so browser processes do
    not pile up.

    Returns a list of ``[field1, field2, number]`` lists, one per ``rt-tr``
    element after the first.
    """
    ##### moved ######
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    ##################
    try:
        driver.get(url)
        driver.implicitly_wait(5)  # implicit wait of 5 s for element lookups
        summoner_info = []
        content = driver.find_elements(By.CLASS_NAME, "rt-tr")
        # The first matched element is skipped.
        for con in content[1:]:
            summoner = con.text.split("\n")
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        return summoner_info
    finally:
        # Release the browser even if scraping raised.
        driver.quit()
Further Consideration
As you know, creating a new web driver instance is resource expensive. If you are just trying to scrape information, HTTP requests are enough most of the times.
For the website you are trying to scrape, I found that the job can be done using HTTP requests only. I revised the script without using Selenium and it takes less than 1 second to load all the leaderboards for all regions.
import json
import time
from concurrent.futures import ThreadPoolExecutor
import requests
def list_to_csv(summoner_info):
    """Write the rows to ``result.csv`` sorted by their third field, descending.

    Rows are sorted by ``int(row[2])``.  Every field is converted to ``str``
    before joining, so numeric fields no longer make ``str.join`` raise
    ``TypeError``.  Lines are separated by ``"\\n"`` with no trailing newline.
    """
    ranked = sorted(summoner_info, key=lambda x: int(x[2]), reverse=True)
    with open("result.csv", "w", encoding="utf-8") as f:
        f.write("\n".join(",".join(map(str, item)) for item in ranked))
def gather_summoner_info(region: str):
    """Fetch page 1 of the ranked leaderboard for *region* from the u.gg API.

    Posts a ``getRankedLeaderboard`` GraphQL query (queue type 420) and
    returns a list of ``(summonerName, tier, lp)`` tuples, one per player in
    the response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within the timeout.
    """
    payload = json.dumps(
        {
            "operationName": "getRankedLeaderboard",
            "variables": {"page": 1, "queueType": 420, "regionId": region},
            "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
        }
    )
    headers = {"Content-Type": "application/json"}
    # Without a timeout a stalled connection would block its worker thread forever.
    response = requests.post("https://u.gg/api", headers=headers, data=payload, timeout=30)
    # Fail loudly on HTTP errors instead of on a confusing JSON/KeyError later.
    response.raise_for_status()
    data = response.json()
    return [
        (player["summonerName"], player["tier"], player["lp"])
        for player in data["data"]["leaderboardPage"]["players"]
    ]
def get_summoner_data(page_count, regions):
    """Fetch the leaderboard of every region concurrently and write them to CSV.

    Each region is submitted to a pool of up to 20 threads and the combined
    results are passed to ``list_to_csv``.  ``page_count`` is accepted for
    interface compatibility but is not used here.
    """
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {r: executor.submit(gather_summoner_info, r) for r in regions}
        # Only the futures are needed, so iterate the values view.
        for future in future_results.values():
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    """Collect the leaderboards for every listed region."""
    get_summoner_data(
        1,
        ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"],
    )


if __name__ == "__main__":
    # Time the whole run and print the elapsed seconds.
    started = time.perf_counter()
    main()
    finished = time.perf_counter()
    print(finished - started)

How to determine the number of columns per row variably in a CSV File with Python?

I am analyzing XML-structured text files about insider dealings. I wrote some code to parse the XML structure and write my output to a CSV file. Each file produces one line of output, with each piece of extracted information in its own column. However, in some files a piece of information occurs multiple times, and my code overwrites the value in the cell, so in the end only one date is left in that cell of my CSV file.
import csv
import glob
import re
import string
import time
import bs4 as bs
# User defined glob pattern for the files to be parsed.
# A raw string cannot end in a single backslash, and glob needs a pattern
# rather than a bare directory, so the pattern matches the files inside it.
TARGET_FILES = r'D:\files\*.*'
# User defined file pointer to LM dictionary
# User defined output file
OUTPUT_FILE = r'D:\ouput\Parser.csv'
# Setup output: header row, one entry per output column
OUTPUT_FIELDS = [r'Datei', 'transactionDate', r'transactionsCode', r'Director', r'Officer', r'Titel', r'10-% Eigner', r'sonstiges', r'SignatureDate']
def main():
    """Parse every file matching TARGET_FILES and write one CSV row per file.

    Writes the header row first, then one ``;``-delimited row per input file
    with the file name in the first column.  The output file is opened in a
    ``with`` block so it is flushed and closed even if parsing fails.
    """
    with open(OUTPUT_FILE, 'w') as f_out:
        wr = csv.writer(f_out, lineterminator='\n', delimiter=';')
        wr.writerow(OUTPUT_FIELDS)
        for file in glob.glob(TARGET_FILES):
            print(file)
            with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
                soup = bs.BeautifulSoup(f_in, 'xml')
                output_data = get_data(soup)
                output_data[0] = file
                wr.writerow(output_data)
def _last_child_text(soup, tag, child, fallback, current):
    """Return the text of *child* in the last *tag* element, else *current*.

    If any *tag* element lacks *child* (AttributeError), *fallback* is
    returned instead.
    """
    try:
        for node in soup.find_all(tag):
            current = node.find(child).text
    except AttributeError:
        current = fallback
    return current


def get_data(soup):
    """Extract the nine output columns from a parsed filing.

    Column 0 is left as 0 for the caller to fill.  When a tag occurs more
    than once, later occurrences overwrite earlier ones, so only one value
    per column survives.  Columns whose tag never occurs keep the initial 0.
    """
    # the index determines the column for the output
    row = [0] * 9

    row[1] = _last_child_text(soup, 'transactionDate', 'value', ('keine Angabe'), row[1])
    row[2] = _last_child_text(soup, 'transactionAcquiredDisposedCode', 'value', 'ka', row[2])

    # Column index -> child tag of each reportingOwnerRelationship element.
    relationship_columns = (
        (3, 'isDirector'),
        (4, 'isOfficer'),
        (5, 'officerTitle'),
        (6, 'isTenPercentOwner'),
        (7, 'isOther'),
    )
    for owner in soup.find_all('reportingOwnerRelationship'):
        for column, tag in relationship_columns:
            try:
                row[column] = owner.find(tag).text
            except AttributeError:
                row[column] = 'ka'

    row[8] = _last_child_text(soup, 'ownerSignature', 'signatureDate', ('ka'), row[8])
    return row
# Script entry point: print a timestamped start banner, run main(), then print a completion banner.
if __name__ == '__main__':
print('\n' + time.strftime('%c') + '\nGeneric_Parser.py\n')
main()
print('\n' + time.strftime('%c') + '\nNormal termination.')
The code does work, but it overwrites a column if, e.g., more than one transaction date is given in the file. So I need code that automatically uses the next column for each additional transaction date. How could this work?
I would be glad if someone have a solution for my problem. Thanks a lot!
Your issue is that you are iterating over the result of
soup.find_all()
and every time you are writing to the same value. You need to do something with
_odata in each iteration, otherwise you will only end up with whatever is written to it the last time.
If you can show us what the data you're trying to parse actually looks like, perhaps we could give a more specific answer.

EOF error when using Pool

In my code, I am trying to use multiprocessing to find the max price of each coin given a URL. There are around 1400 coins that I have to get data for, so I implemented Python's multiprocessing Pool. I'm not sure if I am using it correctly, but I followed the example given from this website: https://docs.python.org/3.4/library/multiprocessing.html?highlight=process
Here is my code:
import requests
import json
from bs4 import BeautifulSoup
from multiprocessing import Pool
max_prices = []
def find_max(url):
    """Return the maximum price of a coin from the JSON served at *url*.

    The response must contain a ``"price_usd"`` sequence of entries whose
    second element is the price.

    Raises:
        ValueError: if ``"price_usd"`` is empty.
        requests.Timeout: if no response arrives within the timeout.
    """
    # A timeout keeps one stalled request from blocking a pool worker forever.
    r = requests.get(url, timeout=30)
    cont = r.json()
    # Generator expression: no intermediate list of prices is needed.
    return max(point[1] for point in cont["price_usd"])
# Load the coin list and derive one graph URL per coin slug.
with open("coins.txt", "r") as f:
    data = json.load(f)
coin_slug = [d["slug"] for d in data]
coin_names = [d["name"] for d in data]
urls = ["https://graphs2.coinmarketcap.com/currencies/" + slug + "/" for slug in coin_slug]
if __name__ == '__main__':
with Pool(5) as p:
print(p.map(find_max, urls)
When I added this part of the code, it gave me an EOF error:
if __name__ == '__main__':
with Pool(5) as p:
print(p.map(find_max, urls)
You have unbalanced brackets in the last line. It should be
print(p.map(find_max, urls)).

Python: Facebook Graph API - batch request

I want to make a batch request getting campaigns for a specific ad account. I created a simple code based on this issue
but I've used some global arrays and I don't know if time.sleep(2) is necessary for this code. My code is as below:
from facebookads import FacebookAdsApi
from facebookads.api import FacebookRequest
import pandas as pd
import time
# Collected by success_callback: alternating [data] and [next-page URL] entries.
batch_body_responses = []
list_of_artists = [1]


def success_callback(response):
    """Store the ``data`` payload and, if present, the next-page URL.

    Appends ``[data]`` to ``batch_body_responses``, followed by ``[next_url]``
    when the response has a ``paging.next`` entry.  A response without paging
    information (e.g. the last page) still has its data stored instead of
    raising ``KeyError`` before anything is saved.
    """
    try:
        # Parse the body once instead of once per field.
        payload = response.json()
        data = payload['data']
        next_url = payload.get('paging', {}).get('next')
    except (IndexError, UnicodeEncodeError):
        # Kept from the original: these errors are deliberately ignored.
        return
    batch_body_responses.append([data])
    if next_url is not None:
        batch_body_responses.append([next_url])
def error_callback(response):
    """Failure callback for batch requests; the response is ignored."""
    return None
def generate_batches(iterable, batch_size_limit):
    """Yield successive lists of at most *batch_size_limit* items.

    Works on any iterable, including generators.  The final batch may be
    shorter; nothing is yielded for an empty iterable.
    """
    # This function can be found in examples/batch_utils.py
    batch = []
    for item in iterable:
        if len(batch) == batch_size_limit:
            yield batch
            batch = []
        batch.append(item)
    # Emit the trailing partial batch, if any.
    if batch:
        yield batch
def get_id_list(art_search_list):
    """Queue one campaigns request per item, run the batches, return the results.

    Items are grouped into batches of at most 25 requests.  Every request is
    the same ``act_1234/campaigns`` GET; the items themselves only determine
    how many requests are queued.  Results are collected by
    ``success_callback`` into the module-level ``batch_body_responses``,
    which is printed and returned.
    """
    your_app_id = '756885'
    your_app_secret = '123456789'
    your_access_token = 'EAA.....'
    api = FacebookAdsApi.init(your_app_id, your_app_secret, your_access_token)
    batch_limit = 25

    batches = []
    for batch in generate_batches(art_search_list, batch_limit):
        next_batch = api.new_batch()
        # One request per item; no single-element list or inner loop is needed.
        for _ in batch:
            request = FacebookRequest(node_id='act_1234/campaigns', method="GET", endpoint="?fields=id,name")
            next_batch.add_request(request, success_callback, error_callback)
        batches.append(next_batch)

    for batch_request in batches:
        batch_request.execute()
        # Pause kept from the original; presumably a rate-limit safeguard - TODO confirm.
        time.sleep(2)
    print(batch_body_responses)
    return batch_body_responses
df = pd.DataFrame(get_id_list(list_of_artists))
How can this code be optimized so that it does not use global arrays? Can it be executed without the sleep statement, and why is the sleep needed?

Unicode characters to Turkish characters

(Edit: my original question is posted here, but the issue has been resolved and the code below is correct). I am looking for advice on how to convert Unicode characters to Turkish characters. The following code (posted online) scrapes tweets for an individual user and outputs a csv file, but the Turkish characters come out as escaped Unicode characters, e.g. \xc4. I am using Python 3 on a Mac.
import sys
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
reload(sys)
sys.setdefaultencoding(default_encoding)
import tweepy #https://github.com/tweepy/tweepy
import csv
import string
import print
#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""
def get_all_tweets(screen_name):
    """Download a user's timeline and save it to ``<screen_name>_tweets.csv``.

    Pages backwards through the timeline 200 tweets at a time using
    ``max_id`` until a request returns nothing, then writes the id, creation
    time and text of every tweet to a UTF-8 (with BOM) CSV file.  An account
    with no tweets yields a file containing only the header row.
    """
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # list holding all the tweepy Tweets
    alltweets = []

    # initial request for the most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        alltweets.extend(new_tweets)
        # id of the oldest tweet less one; used as max_id to prevent duplicates
        oldest = alltweets[-1].id - 1
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

    # transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]

    # write the csv; the file name is built from the screen name
    with open('%s_tweets.csv' % screen_name, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)


if __name__ == '__main__':
    # pass in the username of the account you want to download
    get_all_tweets("")
The csv module docs recommend you specify the encoding when you open the file. (and also that you use newline='' so the CSV module can do its own handling for newlines). Don't encode Unicode strings when writing rows.
import csv
# An explicit encoding plus newline='' lets the csv module do its own newline handling.
header = ['id', 'created_at', 'text']
rows = [[123, 456, 'Äβç']]
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    out = csv.writer(f)
    out.writerow(header)
    out.writerows(rows)

Resources