How to fetch data through multiple accounts with threading in Python 3

I want to write a function that can fetch data in parallel.
The background: the information of 100 sites can be fetched from site A.
The same account can't be used more than once at a time, so I created 5 different accounts on site A that enable me to fetch information with 5 accounts.
Account info like:
worker1 pawd
worker2 pawd
worker3 pawd
worker4 pawd
worker5 pawd
If you want to get information about site B from site A, you need to type a command like get info for siteB_IP on site A.
Suppose there are 100 IPs stored in a list named IPlist.
How can I fetch the information for all 100 IPs with the 5 available accounts in parallel by threading, so that all of the information is stored in a variable without conflict?
What I have tried is below; this code cannot be executed because I have no way to achieve the solution:
import threading

user = 'root'
pwd = 'Changeme123'
# the first step is to log on with the default account
rs = link.send_cmd(r':lognew:' + '"' + user + '","' + pwd + '"')
# then get all neighbor IPs from the logon site; the function parse_multi is used for parsing data
IPlist = parse_multi(link.send_cmd('get-IP-info:0xffff'))

def Fetchinfo(user, ip):
    rs = link.send_cmd(r':lognew:' + '"' + user + '","' + pwd + '"')
    areainfo = link.send_cmd('get info for ' + ip)

for ip in IPlist:
    # how to handle 100 IPs in the situation of 5 available accounts?
    thread = threading.Thread(target=Fetchinfo, args=[worker, ip])

Since you don't want calls from the same account ID and password to happen concurrently, you can define a function that sequentially loops through a sub-list of IPs and fetches synchronously:
def fetch_data_for_ips(account_id, account_password, ips_to_fetch):
    results = list()
    for ip_to_fetch in ips_to_fetch:
        # fetch with the account_id and password synchronously
        result = ...
        results.append(result)
    return results
Then, use a thread pool to run the different batches concurrently, one batch per account:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Split the workload for each account to fetch
num, remainder = divmod(len(ip_list), len(accounts))
num_ips_for_each_account = num + bool(remainder)

# This gives e.g. [[1,2,3], [4,5,6]], where each sublist is for one account to fetch
ip_lists_for_each_account = [ip_list[i: i + num_ips_for_each_account] for i in range(0, len(ip_list), num_ips_for_each_account)]

# You should only need a number of threads equal to the number of accounts you have
with ThreadPoolExecutor(len(accounts)) as executor:
    # Feel free to use a set instead if you don't need to know which result came from which thread
    futures = dict()
    results = list()
    for (account_id, account_password), ips_to_fetch in zip(accounts, ip_lists_for_each_account):
        future = executor.submit(fetch_data_for_ips, account_id, account_password, ips_to_fetch)
        futures[future] = account_id
    for future in as_completed(futures):
        result = future.result()
        account_id = futures[future]
        print(f'{account_id} fetched these:', result)
        results.extend(result)

You can refer to the sample code below, following rcshon's suggestion.
def fetch_data_for_ips(account_id, ips_to_fetch):
    results = list()
    for ip_to_fetch in ips_to_fetch:
        # fetch with the account_id and password synchronously
        result = ','.join((account_id, ip_to_fetch))
        results.append(result)
    return results

from concurrent.futures import ThreadPoolExecutor, as_completed

accounts = ['worker1', 'worker2', 'worker3', 'worker4', 'worker5']
ip_list = [str(_) for _ in range(10)]

# Split the workload for each account to fetch
num, remainder = divmod(len(ip_list), len(accounts))
num_ips_for_each_account = num + bool(remainder)

# This gives e.g. [[1,2,3], [4,5,6]], where each sublist is for one account to fetch
ip_lists_for_each_account = [ip_list[i: i + num_ips_for_each_account] for i in range(0, len(ip_list), num_ips_for_each_account)]

# You should only need a number of threads equal to the number of accounts you have
with ThreadPoolExecutor(len(accounts)) as executor:
    # Feel free to use a set instead if you don't need to know which result came from which thread
    futures = dict()
    results = list()
    for account_id, ips_to_fetch in zip(accounts, ip_lists_for_each_account):
        future = executor.submit(fetch_data_for_ips, account_id, ips_to_fetch)
        futures[future] = account_id
    for future in as_completed(futures):
        result = future.result()
        account_id = futures[future]
        print(f'{account_id} fetched these:', result)
        results.extend(result)
Output:
worker3 fetched these: ['worker3,4', 'worker3,5']
worker2 fetched these: ['worker2,2', 'worker2,3']
worker1 fetched these: ['worker1,0', 'worker1,1']
worker4 fetched these: ['worker4,6', 'worker4,7']
worker5 fetched these: ['worker5,8', 'worker5,9']
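Note that as_completed yields futures in the order they finish, so the worker lines above can print in a different order on each run; results still ends up holding all ten entries regardless of completion order.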

Related

Trying to store a value in a variable before displaying it, but changing print(record['recommendation']) to temp=record['recommendation'] raises an error

from neo4j import GraphDatabase, basic_auth

driver = GraphDatabase.driver(
    "neo4j://34.201.9.108:7687",
    auth=basic_auth("neo4j", "chart-certifications-bottom"))
cypher_query = '''
MATCH (m:Movie {title:$movie})<-[:RATED]-(u:User)-[:RATED]->(rec:Movie)
RETURN distinct rec.title AS recommendation LIMIT 20
'''
with driver.session(database="neo4j") as session:
    results = session.read_transaction(
        lambda tx: tx.run(cypher_query,
                          movie="Crimson Tide").data())
    for record in results:
        print(record['recommendation']) #<----------------------- OK
driver.close()
When I try to store the value in a variable before displaying it, i.e. change print(record['recommendation']) to temp=record['recommendation'], I get an error:
@app.get("/neo4j")
def graph_db():
    driver = GraphDatabase.driver(
        "neo4j://34.201.9.108:7687",
        auth=basic_auth("neo4j", "chart-certifications-bottom"))
    cypher_query = '''
    MATCH (n:Person) RETURN n LIMIT 25
    '''
    with driver.session(database="neo4j") as session:
        results = session.read_transaction(
            lambda tx: tx.run(cypher_query,
                              movie="Crimson Tide").data())
        data = []
        for record in results:
            temp = record['recommendation'] #<----------------------- error
            data.append(temp)
        result = data
    driver.close()
    return {"Result ": result}
When you replaced your query with
MATCH (n:Person) RETURN n LIMIT 25
you are returning up to 25 Person nodes, and the column recommendation does not exist, so this line fails:
temp=record['recommendation']
Make sure that the column names in the result set your query returns match the keys you are accessing.
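For example, since MATCH (n:Person) RETURN n returns a single column named n, a minimal sketch of the corrected loop (assuming that query) would be:
for record in results:
    temp = record['n']  # the key must match the RETURN clause
    data.append(temp)
Alternatively, alias the column in Cypher, e.g. RETURN n.name AS recommendation, so that record['recommendation'] keeps working.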

Capture 1 resource from a request of many resources: easier approach?

I am relatively new to Python and very new to SimPy. I am trying to build a model of a hospital system that:
Has roughly 20 resources (units) where each resource has a capacity of 5 to 50 (beds)
Not all units can serve the same patients. They have specialties.
When a patient needs a bed, the hospital requests 1 bed from roughly 5 of the 20 units.
So, what I want to do is make a request against multiple Unit resources and only capture 1 bed from 1 available Unit. After many iterations, I think I have found a way of doing this but my approach feels overly complicated.
Below I am showing code using SimPy's conditional AnyOf event. The way AnyOf works, if more than one resource has availability, then more than one request will be filled. So, after the AnyOf request, I release any extra captured beds and cancel any requests still pending.
Is there an easier approach?
from dataclasses import dataclass
import random

import simpy
from simpy.events import AnyOf, Event

@dataclass
class Unit():
    env: simpy.Environment
    identifier: str
    capacity: int = 1

    def __post_init__(self):
        self.beds: simpy.Resource = simpy.Resource(self.env, self.capacity)

def make_request(env, units: list[Unit]):
    # create a list of simpy request events for a random pool of units
    # purpose is to put them in a conditional request
    any_of_request: list[Event] = []
    request_to_unit_dictionary = {}
    random_pool_size = random.randint(1, 5)
    for x in range(random_pool_size):
        random_unit = random.randint(0, len(units) - 1)
        unit = units[random_unit]
        res_request = unit.beds.request()
        request_to_unit_dictionary[res_request] = unit
        any_of_request.append(res_request)
    get_one_bed_request = AnyOf(env, any_of_request)
    captured_units = yield get_one_bed_request
    # how do I determine what unit was captured by the request?
    # it is possible that several resources are available
    # and several requests get filled
    captured_requests = list(captured_units.keys())
    captured_request = captured_requests[0]
    captured_unit: Unit = request_to_unit_dictionary[captured_request]
    print(captured_unit)
    # if I understand correctly, if I only want 1 request then
    # release any "extra" captures
    for r in captured_requests:
        if r != captured_request:
            r.resource.release(r)
    # cancel any of the requests not captured
    for r in any_of_request:
        if r not in captured_requests:
            r.cancel()

env = simpy.Environment()
# create 10 units, each with 1 capacity for this example
units: list[Unit] = []
for x in range(10):
    units.append(Unit(identifier=f'unit{x}', env=env, capacity=1))
env.process(make_request(env, units))
env.process(make_request(env, units))
env.process(make_request(env, units))
env.process(make_request(env, units))
env.run()
Thanks,
Dan

Peewee-async - How to do a simple JOIN (or subquery / prefetch)

I'm stuck on a pretty simple issue with peewee-async regarding JOINs, or perhaps I need to use a subquery or prefetch... I can't figure out what kind of query I need.
I have 2 database tables (parent/child):
class Group(PeeweeModel):
    id = peewee.AutoField()
    name = peewee.TextField()

class Channel(PeeweeModel):
    id = peewee.AutoField()
    name = peewee.TextField()
    group = peewee.ForeignKeyField(Group, backref="channels")
I need to fetch 1 group object, and this object has multiple channel objects.
I tried:
q = Group.select(Group, Channel).join(Channel)
But my backref 'channels' is always a ModelQuery instance, not the actual result set.
Full code
import asyncio

import peewee
import peewee_async
from peewee_async import Manager, PooledPostgresqlDatabase

database = PooledPostgresqlDatabase('test', max_connections=4, user='postgres', password='', host='127.0.0.1')
objects = peewee_async.Manager(database)

class PeeweeModel(peewee.Model):
    class Meta:
        database = database

class Group(PeeweeModel):
    id = peewee.AutoField()
    name = peewee.TextField()

class Channel(PeeweeModel):
    id = peewee.AutoField()
    name = peewee.TextField()
    group = peewee.ForeignKeyField(Group, backref="channels")

Group.create_table()
Channel.create_table()
database.set_allow_sync(False)

async def handler():
    # create 1 group object
    group = await objects.create(Group, name="TestGroup")
    # create 2 channel objects, assign to group
    await objects.create(Channel, name="TestName1", group=group)
    await objects.create(Channel, name="TestName2", group=group)
    # Query 1 group, and hopefully it will have the channels
    q = Group.select(Group, Channel).join(Channel)
    results = await objects.execute(q)
    for result in results:
        print(result.channels)  # problem: channels is not a list of channel objects, but a `ModelSelect` instead
    with objects.allow_sync():
        Channel.drop_table(True)
        Group.drop_table(True)

loop = asyncio.get_event_loop()
loop.run_until_complete(handler())
loop.close()
I was able to get help from an expert™, and the solution is to use prefetch():
async def handler():
    # create 1 group object
    group = await objects.create(Group, name="TestGroup")
    # create 2 channel objects, assign to group
    await objects.create(Channel, name="TestName", group=group)
    await objects.create(Channel, name="TestName", group=group)
    # Query 1 group, and hopefully it will have the channels
    q = Group.select(Group)
    groups = await objects.prefetch(q, Channel.select(Channel))
    for group in groups:
        print(group, group.channels)  # channels is a list of channels
    with objects.allow_sync():
        Channel.drop_table(True)
        Group.drop_table(True)
Peewee will figure out the relationship (backref) by itself.
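Under the hood, prefetch() runs one query per model passed in (here: one for Group, one for Channel) and then matches the channel rows to their groups in Python via the foreign key, which is why group.channels ends up as a plain list instead of a lazy ModelSelect.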

How to get the list of followers from an Instagram account without getting banned?

I am trying to scrape all the followers of some particular Instagram accounts. I am using Python 3.8.3 and the latest version of the Instaloader library. The code I have written is given below:
# Import the required libraries:
import instaloader
import time
from random import randint

# Start time:
start = time.time()

# Create an instance of instaloader:
loader = instaloader.Instaloader()

# Credentials & target account:
user_id = USERID
password = PASSWORD
target = TARGET  # Account whose list of followers needs to be scraped

# Login or load the session:
loader.login(user_id, password)

# Obtain the profile metadata of the target:
profile = instaloader.Profile.from_username(loader.context, target)

# Print the list of followers and save it in a text file:
try:
    # The list to store the collected user handles of the followers:
    followers_list = []
    # Variables used to apply pauses to slow down scraping:
    count = 0
    short_counter = 1
    short_pauser = randint(19, 24)
    long_counter = 1
    long_pauser = randint(4900, 5000)
    # Fetch the followers one by one:
    for follower in profile.get_followers():
        sleeper = randint(840, 1020)
        # Short pause for the process:
        if (short_counter % short_pauser == 0):
            short_counter = 0
            short_pauser = randint(19, 24)
            print('\nShort pause.\n')
            time.sleep(1)
        # Long pause for the process:
        if (long_counter % long_pauser == 0):
            long_counter = 0
            long_pauser = randint(4900, 5000)
            print('\nLong pause.\n')
            time.sleep(sleeper)
        # Append the list and print the follower's user handle:
        followers_list.append(follower.username)
        print(count, '', followers_list[count])
        # Increment the counters accordingly:
        count = count + 1
        short_counter = short_counter + 1
        long_counter = long_counter + 1
    # Store the followers list in a txt file:
    txt_file = target + '.txt'
    with open(txt_file, 'a+') as f:
        for the_follower in followers_list:
            f.write(the_follower)
            f.write('\n')
except Exception as e:
    print(e)

# End time:
end = time.time()
total_time = end - start

# Print the time taken for execution:
print('Time taken for complete execution:', total_time, 's.')
I am getting the following error after scraping some data:
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
400 Bad Request
In fact, the error occurs when Instagram detects unusual activity, disables the account for a while, and prompts the user to change the password.
I have tried:
(1) Slowing down the process of scraping.
(2) Adding pauses in between in order to make the program more human-like.
Still, no progress.
How to bypass such restrictions and get the complete list of all the followers?
If getting the entire list is not possible, what is the best way to get a list of at least 20,000 followers (from multiple accounts) without getting banned, having the account disabled, or facing similar inconveniences?

Loop not looping through all elements in list

I have
print(len(pcb.resources))
for res in pcb.resources:
    print("Releasing resource: " + res.name)
    self.releaseResource(res.name, pcb)
print("After: " + str(len(pcb.resources)))
Which outputs
2 # from 1st print: this is correct, I have 2 elements in list
Releasing resource: R1 # is it not looping through the 2nd element? Notice the len() is 2
After: 1 # from print after loop. I am expecting 0
UPDATE
I notice it's something to do with the function call releaseResource. But how might releaseResource be affecting the calling loop?
def releaseResource(self, name, pcb=None):
    callScheduler = pcb is not None  # if pcb is set, we're called from the release loop, so don't call the scheduler first
    if pcb is None:
        pcb = self.running
    # check if resource exists
    if not any(rcb.name == name for rcb in self.resources):
        return False
    rcb = next(r for r in self.resources if r.name == name)
    # remove resource from running pcb's resources
    pcb.resources.remove(rcb)
    if len(rcb.waitingList) == 0:
        # no waiting processes: resource is free
        rcb.status = RCB.STATUS_FREE
        rcb.heldBy = None
    else:
        # dequeue from the resource's waiting list
        pcb = rcb.waitingList.popleft()
        # put resource into the process's resources list
        pcb.resources.append(rcb)
        rcb.heldBy = pcb
        # make it ready
        pcb.status = PCB.STATUS_READY
        pcb.statusList = self.readyList
        self.readyList.enqueue(pcb)
    # call scheduler
    if callScheduler:
        self.scheduler()
    return True
It looks to me like you modify, inside releaseResource, the very list you iterate over, namely in the line pcb.resources.remove(rcb). Iterate over a copy instead:
for res in pcb.resources[:]:
    print("Releasing resource: " + res.name)
    self.releaseResource(res.name, pcb)
See the documentation.
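To see why the second element is skipped, here is a minimal standalone sketch (hypothetical names) of the same pitfall:
items = ['R1', 'R2']
for item in items:
    print('Releasing:', item)
    items.remove(item)  # mutates the list being iterated over
print('After:', len(items))
# Prints only 'Releasing: R1', then 'After: 1':
# removing 'R1' shifts 'R2' to index 0, while the iterator moves on to index 1.
Iterating over the copy pcb.resources[:] leaves the original list free to shrink without confusing the iterator.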
