How to correctly update a map field in firestore using python? - python-3.x

I have a map in Firestore and I want to regularly update it (not overwrite previous keys). Most of the time it works; however, sometimes it fails without throwing any exception. The only indication that something went wrong is that the result (https://cloud.google.com/firestore/docs/reference/rest/v1/WriteResult) has an update_time which I can compare to now(); if the difference is too large, I know the update did not happen just now. The problem is that afterwards the whole map is missing (all previous keys are gone). So not only did it fail to add the current keys, it somehow wiped out the whole field.
Below is the full code:
error_keys = []
for key in data.keys():
    # continue if set is empty
    if not data[key]:
        continue
    try:
        new_keys = {
            f'myMap.{k}': v for k, v in data[key].items()}
        result = self.db.collection(u'myCollection').document(key).update(
            new_keys
        )
        now = datetime.now(tz=pytz.utc)
        dt_string = now.strftime("%d/%m/%Y %H:%M:%S.%fZ")
        duration = now - result.update_time
        duration_in_s = duration.total_seconds()
        minutes = divmod(duration_in_s, 60)[0]
        if minutes > 1.0:
            logger.warning("Diff to update time is larger than 1 min")
            logger.info(f'Now: {dt_string}')
            logger.info(f'Duration in minutes: {minutes}')
            logger.info(f'Adding {key} to error_keys')
            error_keys.append(key)
        logger.info(f'KEY: {key}: {data[key]} Update_time: {result.update_time} Diff_minutes: {minutes}')
    except:
        logger.warning(
            f'Could not write key: {key} with data {data[key]} to firebase.')
        error_keys.append(key)
        logger.exception('Exception writing keys')
logger.info(f'ERROR_KEYS: {error_keys}')
return error_keys
I am using:
google-cloud-firestore 2.1.0
Python 3.7.3
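For what it's worth, one alternative sketch (not a confirmed fix for the disappearing map) is to write the nested keys with set(..., merge=True) instead of dotted field paths in update(); merge=True merges only the given fields into the document and leaves the other keys of myMap untouched. The helper name merge_map_keys and the client setup are assumptions; the collection and field names are taken from the question.
from google.cloud import firestore

db = firestore.Client()

def merge_map_keys(doc_id, new_items):
    # Hypothetical helper: merge new_items into the existing 'myMap' field
    # without touching its other keys.
    doc_ref = db.collection(u'myCollection').document(doc_id)
    result = doc_ref.set({u'myMap': new_items}, merge=True)
    return result.update_time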

Related

Random key lookup on LMDB/python vs BerkeleyDB/python (How to make LMDB lookup faster)

I have a program written in Python that uses BerkeleyDB to store data (event logs), which I migrated to LMDB. My problem is: before an event gets written, the program does a lookup to check whether the event already exists. I noticed that the BerkeleyDB version is much faster at doing single-value lookups over 13k+ records (the LMDB version seems to be about 1 second slower for every lookup), even with transactions enabled in BerkeleyDB. Any idea how to speed up the LMDB version? Note that I already have 70 GB+ (about 30 million records) of data stored in my BerkeleyDB, and doing additional processing on those events takes me more than an hour, so I thought switching to LMDB would decrease the processing time.
My LMDB environment was opened this way (I even set readahead to False, but the database size is just about 35 MB so I don't think it matters):
env = lmdb.open(db_folder, map_size=100000000000, max_dbs=4, readahead=False)
database = env.open_db('events'.encode())
My BerkeleyDB was opened this way:
env = db.DBEnv()
env.open(db_folder, db.DB_INIT_MPOOL | db.DB_CREATE | db.DB_INIT_LOG | db.DB_INIT_TXN | db.DB_RECOVER, 0)
database = db.DB(env)
BerkeleyDB version of check:
if event['eId'].encode('utf-8') in database:
    duplicate_count += 1
else:
    try:
        txn = env.txn_begin(None)
        database[event['eId'].encode('utf-8')] = json.dumps(event).encode('utf-8')
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None
        event_count += 1
LMDB version:
with env.begin(write=True, buffers=True, db=database) as txn:
    if txn.get(event['eId'].encode()) is not None:
        dup_event_count += 1
    else:
        txn.put(event['eId'].encode(), json.dumps(event).encode('utf-8'))
        event_count += 1
Solution:
Place with env.begin outside the loop:
#case('rand lookup')
def test():
    with env.begin() as txn:
        for word in words:
            txn.get(word)
    return len(words)

#case('per txn rand lookup')
def test():
    for word in words:
        with env.begin() as txn:
            txn.get(word)
    return len(words)
Figured this out myself. What I was doing is a per-transaction random lookup. I just had to place with env.begin outside of the for loop (not visible in my example), as suggested in this example: https://raw.githubusercontent.com/jnwatson/py-lmdb/master/examples/dirtybench.py
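For reference, applying the same idea to the dedup loop from the question might look like the sketch below (assuming events is the iterable of parsed log records and db_folder is the LMDB path): one write transaction wraps the whole loop instead of opening a new one per event.
import json
import lmdb

env = lmdb.open(db_folder, map_size=100000000000, max_dbs=4, readahead=False)
database = env.open_db('events'.encode())

event_count = 0
dup_event_count = 0

# A single write transaction for the whole batch, not one per event.
# 'events' is assumed to be the iterable of parsed log records.
with env.begin(write=True, buffers=True, db=database) as txn:
    for event in events:
        key = event['eId'].encode('utf-8')
        if txn.get(key) is not None:
            dup_event_count += 1
        else:
            txn.put(key, json.dumps(event).encode('utf-8'))
            event_count += 1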

Trying to modify a result before displaying it by storing it in a variable, but getting an error when print(record['recommendation']) becomes temp=record['recommendation']

from neo4j import GraphDatabase, basic_auth

driver = GraphDatabase.driver(
    "neo4j://34.201.9.108:7687",
    auth=basic_auth("neo4j", "chart-certifications-bottom"))

cypher_query = '''
MATCH (m:Movie {title:$movie})<-[:RATED]-(u:User)-[:RATED]->(rec:Movie)
RETURN distinct rec.title AS recommendation LIMIT 20
'''

with driver.session(database="neo4j") as session:
    results = session.read_transaction(
        lambda tx: tx.run(cypher_query,
                          movie="Crimson Tide").data())
    for record in results:
        print(record['recommendation'])  # <----------------------- OK

driver.close()
When I try to modify the value before displaying it by putting it in a variable, i.e. when print(record['recommendation']) becomes temp=record['recommendation'], I get an error:
@app.get("/neo4j")
def graph_db():
    driver = GraphDatabase.driver(
        "neo4j://34.201.9.108:7687",
        auth=basic_auth("neo4j", "chart-certifications-bottom"))
    cypher_query = '''
    MATCH (n:Person) RETURN n LIMIT 25
    '''
    with driver.session(database="neo4j") as session:
        results = session.read_transaction(
            lambda tx: tx.run(cypher_query,
                              movie="Crimson Tide").data())
        data = []
        for record in results:
            temp = record['recommendation']  # <----------------------- error
            data.append(temp)
        result = data
    driver.close()
    return {"Result ": result}
When you replaced your query with
MATCH (n:Person) RETURN n LIMIT 25
you are returning 25 Person nodes, and the column recommendation does not exist, so this line fails:
temp=record['recommendation']
Make sure that the column names in the result set your query returns match the column names you are accessing.
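For example, a minimal sketch (same connection details as the question) that reads the column this query actually returns, i.e. the alias n from the RETURN clause; when using .data(), each record['n'] comes back as a dict of the node's properties:
from neo4j import GraphDatabase, basic_auth

driver = GraphDatabase.driver(
    "neo4j://34.201.9.108:7687",
    auth=basic_auth("neo4j", "chart-certifications-bottom"))

cypher_query = '''
MATCH (n:Person) RETURN n LIMIT 25
'''

with driver.session(database="neo4j") as session:
    results = session.read_transaction(
        lambda tx: tx.run(cypher_query).data())
    # The key must match the alias in the RETURN clause: here it is 'n'.
    data = [record['n'] for record in results]

driver.close()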

How to get the list of followers from an Instagram account without getting banned?

I am trying to scrape all the followers of some particular Instagram accounts. I am using Python 3.8.3 and the latest version of Instaloader library. The code I have written is given below:
# Import the required libraries:
import instaloader
import time
from random import randint

# Start time:
start = time.time()

# Create an instance of instaloader:
loader = instaloader.Instaloader()

# Credentials & target account:
user_id = USERID
password = PASSWORD
target = TARGET  # Account whose list of followers needs to be scraped

# Login or load the session:
loader.login(user_id, password)

# Obtain the profile metadata of the target:
profile = instaloader.Profile.from_username(loader.context, target)

# Print the list of followers and save it in a text file:
try:
    # The list to store the collected user handles of the followers:
    followers_list = []

    # Variables used to apply pauses to slow down scraping:
    count = 0
    short_counter = 1
    short_pauser = randint(19, 24)
    long_counter = 1
    long_pauser = randint(4900, 5000)

    # Fetch the followers one by one:
    for follower in profile.get_followers():
        sleeper = randint(840, 1020)

        # Short pause for the process:
        if short_counter % short_pauser == 0:
            short_counter = 0
            short_pauser = randint(19, 24)
            print('\nShort Pause.\n')
            time.sleep(1)

        # Long pause for the process:
        if long_counter % long_pauser == 0:
            long_counter = 0
            long_pauser = randint(4900, 5000)
            print('\nLong pause.\n')
            time.sleep(sleeper)

        # Append the list and print the follower's user handle:
        followers_list.append(follower.username)
        print(count, '', followers_list[count])

        # Increment the counters accordingly:
        count = count + 1
        short_counter = short_counter + 1
        long_counter = long_counter + 1

    # Store the followers list in a txt file:
    txt_file = target + '.txt'
    with open(txt_file, 'a+') as f:
        for the_follower in followers_list:
            f.write(the_follower)
            f.write('\n')
except Exception as e:
    print(e)

# End time:
end = time.time()
total_time = end - start

# Print the time taken for execution:
print('Time taken for complete execution:', total_time, 's.')
I am getting the following error after scraping some data:
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
400 Bad Request
The error occurs when Instagram detects unusual activity, disables the account for a while, and prompts the user to change the password.
I have tried:
(1) Slowing down the process of scraping.
(2) Adding pauses in between in order to make the program more human-like.
Still, no progress.
How can I bypass such restrictions and get the complete list of all the followers?
If getting the entire list is not possible, what is the best way to get a list of at least 20,000 followers (from multiple accounts) without getting banned, having the account disabled, or facing similar inconveniences?
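Not a full answer to the rate-limit question, but one commonly suggested mitigation is to reuse a saved Instaloader session so that every run does not perform a fresh login. A minimal sketch using Instaloader's session helpers, with USERID and PASSWORD as placeholders (as in the question):
import instaloader

loader = instaloader.Instaloader()
try:
    # Reuse a previously saved session file instead of logging in again.
    loader.load_session_from_file(USERID)
except FileNotFoundError:
    # No saved session yet: log in once and save it for future runs.
    loader.login(USERID, PASSWORD)
    loader.save_session_to_file()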

Max Aggregation with Hazelcast-jet

I want to do a simple max across an entire dataset. I started with the Kafka example at: https://github.com/hazelcast/hazelcast-jet-code-samples/blob/0.7-maintenance/kafka/src/main/java/avro/KafkaAvroSource.java
I just changed the pipeline to:
p.drawFrom(KafkaSources.<Integer, User>kafka(brokerProperties(), TOPIC))
    .map(Map.Entry::getValue)
    .rollingAggregate(minBy(comparingInt(user -> (Integer) user.get(2))))
    .map(user -> (Integer) user.get(2))
    .drainTo(Sinks.list("result"));
and the code that reads the result to:
IListJet<Integer> res = jet.getList("result");
SECONDS.sleep(10);
System.out.println(res.get(0));
SECONDS.sleep(15);
System.out.println(res.get(0));
cancel(job);
to get the largest age of the people in the topic. However, it doesn't return 20 and seems to return different values on different runs. Any idea why?
You seem to be using rollingAggregate, which produces a new output item every time it receives some input, but all you check is the first item it emitted. You must instead find the latest item it emitted. One way to achieve it is by pushing the result into an IMap sink, using the same key every time:
p.drawFrom(KafkaSources.<Integer, User>kafka(brokerProperties(), TOPIC))
    .withoutTimestamps()
    .map(Map.Entry::getValue)
    .rollingAggregate(minBy(comparingInt(user -> (Integer) user.get(2))))
    .map(user -> entry("user", (Integer) user.get(2)))
    .drainTo(Sinks.map("result"));
You can fetch the latest result with
IMap<String, Integer> result = jet.getMap("result");
System.out.println(result.get("user"));

Python - Error querying Solarwinds N-Central via SOAP

I'm using Python 3 to write a script that generates a customer report from SolarWinds N-Central. The script uses SOAP to query N-Central, and I'm using zeep for this project. While not new to Python, I am new to SOAP.
When calling the CustomerList function I'm getting: TypeError: __init__() got an unexpected keyword argument 'listSOs'
import zeep
wsdl = 'http://' + <server url> + '/dms/services/ServerEI?wsdl'
client = zeep.CachingClient(wsdl=wsdl)
config = {'listSOs': 'true'}
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass, Settings=config)
Per the parameters below, 'listSOs' is not only a valid keyword, it's the only one accepted.
CustomerList
public com.nable.nobj.ei.Customer[] CustomerList(String username, String password, com.nable.nobj.ei.T_KeyPair[] settings) throws RemoteException
Parameters:
username - MSP N-central username
password - Corresponding MSP N-central password
settings - A list of non default settings stored in a T_KeyPair[]. Below is a list of the acceptable Keys and Values. If not used leave null
(Key) listSOs - (Value) "true" or "false". If true, only SOs will be shown; if false, only customers and sites will be shown. Default value is false.
I've also tried passing the dictionary as part of a list:
config = []
key = {'listSOs': 'true'}
config += key
TypeError: Any element received object of type 'str', expected lxml.etree._Element or builtins.dict or zeep.objects.T_KeyPair
Omitting the Settings value entirely:
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass)
zeep.exceptions.ValidationError: Missing element Settings (CustomerList.Settings)
And trying zeep's SkipValue:
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass, Settings=zeep.xsd.SkipValue)
zeep.exceptions.Fault: java.lang.NullPointerException
I'm probably missing something simple, but I've been banging my head against the wall off and on over this for a while. I'm hoping someone can point me in the right direction.
Here's the source code from my getAssets.py script. I wrote it in Python 2.7, but it's easily upgradeable. Hope it helps someone else; N-central's API documentation is really bad.
#pip2.7 install zeep
import zeep, sys, csv, copy
from zeep import helpers

api_username = 'your_ncentral_api_user'
api_password = 'your_ncentral_api_user_pw'
wsdl = 'https://(yourdomain|tenant)/dms2/services2/ServerEI2?wsdl'
client = zeep.CachingClient(wsdl=wsdl)

response = client.service.deviceList(
    username=api_username,
    password=api_password,
    settings={
        'key': 'customerId',
        'value': 1
    }
)

# If you can't tell yet, I code sloppy
devices_list = []
device_dict = {}
dev_inc = 0
max_dict_keys = 0
final_keys = []

for device in response:
    # Iterate through all device nodes
    for device_properties in device.items:
        # Iterate through each device's properties and add it to a dict (keyed array)
        device_dict[device_properties.first] = device_properties.second

    # Dig further into device properties
    device_properties = client.service.devicePropertyList(
        username=api_username,
        password=api_password,
        deviceIDs=device_dict['device.deviceid'],
        reverseOrder=False
    )
    prop_ind = 0  # This is a hacky thing I did to make my CSV writing work
    for device_node in device_properties:
        for prop_tree in device_node.properties:
            for key, value in helpers.serialize_object(prop_tree).items():
                prop_ind += 1
                device_dict["prop" + str(prop_ind) + "_" + str(key)] = str(value)

    # Append the dict to a list (array), giving us a multi-dimensional array;
    # you need to do a deep copy, as .copy will act like a pointer
    devices_list.append(copy.deepcopy(device_dict))

    # Check the number of keys in the last item
    if len(devices_list[-1].keys()) > max_dict_keys:
        max_dict_keys = len(devices_list[-1].keys())
        final_keys = devices_list[-1].keys()

print "Gathered all the datas of N-central devices count: ", len(devices_list)

# Write the data out to a CSV
with open('output.csv', 'w') as csvfile:
    fieldnames = final_keys
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for csv_line in devices_list:
        writer.writerow(csv_line)
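Applying the same key/value shape to the CustomerList call from the original question might look like the sketch below. This is an untested guess based on the deviceList call above and the T_KeyPair[] signature in the docs; the host and credentials are placeholders.
import zeep

nc_user = 'api_user'                  # placeholder credentials
nc_pass = 'api_password'
server_url = 'your.ncentral.server'   # placeholder host

wsdl = 'http://' + server_url + '/dms/services/ServerEI?wsdl'
client = zeep.CachingClient(wsdl=wsdl)

# Each T_KeyPair entry is a {'key': ..., 'value': ...} dict rather than a plain
# {'listSOs': 'true'} mapping, which is what triggered the TypeError.
customers = client.service.CustomerList(
    Username=nc_user,
    Password=nc_pass,
    Settings=[{'key': 'listSOs', 'value': 'true'}]
)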
