pandas aggregation based on timestamp threshold - python-3.x

I hope somebody can help me to solve this issue.
I have a csv file structured as follows:
I am trying to group the events by message, name, and userID if the events occur within a 10-minute threshold starting from the first matched event.
The output I am expecting from the csv is to see only 3 rows, because the second and third should be grouped (they fall within the 10-minute threshold and have the same message, name, and ID), plus an extra column named event_count that reports how many times that event occurred, like this:
I started working on this and my script looks like this:
import csv
import pandas as pd
# 0. sort data by timestamp if not already sorted
file_csv = 'test.csv'
f = pd.read_csv(file_csv)
f['#timestamp'] = pd.to_datetime(f['#timestamp'])
f = f.sort_values('#timestamp')
# lazy groupby
groups = f.groupby(['message','name','userID'])
# 1. compute the time differences `timediff` and compare to threshold
f['timediff'] = groups['#timestamp'].diff() < pd.Timedelta(minutes=10)
# 2. find the blocks with cumsum
f['event_count'] = groups['timediff'].cumsum()
# 3. groupby the blocks
out = (f.groupby(['message','name', 'userID'])
         .agg({'#timestamp':'first', 'timediff':'count'})
      )
keep_col = ['#timestamp', 'message', 'name', 'userID', 'event_count']
new_f = f[keep_col]
new_f.to_csv("aggregationtest.csv", index=False)
But the aggregation is totally wrong, because it is grouping all the events together even if they don't fall within the 10-minute threshold.
I am really struggling to understand what I am doing wrong, so I would appreciate it if somebody could help me understand the issue.
UPDATE:
After some testing I managed to get output closer to what I am expecting, but it is still wrong.
I made some updates to the out variable as follows:
out = (f.groupby(['message','name', 'userID', 'timediff']).agg({'#timestamp':'first','message': 'unique','name': 'unique', 'userID': 'unique', 'timediff': 'count'}))
This bit of code now produces an output that looks like:
But even though it is grouping now, the count is wrong. Given this csv file:
#timestamp,message,name,userID
2021-07-13 21:36:18,Failed to download file,Failed to download file,admin
2021-07-14 03:46:16,Successful Logon for user "user1",Logon Attempt,1
2021-07-14 03:51:16,Successful Logon for user "user1",Logon Attempt,1
2021-07-14 03:54:16,Successful Logon for user "user1",Logon Attempt,1
2021-07-14 04:55:16,Successful Logon for user "user1",Logon Attempt,1
I am expecting to get the following event_count values:
1
3
1
But I am getting a different outcome.

You'll have to somehow identify the different periods within the groups. The solution below gives each period within the group a name, which can then be included in the groupby that generates the count:
import pandas as pd
file_csv = 'test.csv'
f = pd.read_csv(file_csv)
f['#timestamp'] = pd.to_datetime(f['#timestamp'])
f = f.sort_values('#timestamp')
def check(item):  # taken from https://stackoverflow.com/a/53189777/11380795
    diffs = item - item.shift()
    laps = diffs > pd.Timedelta('10 min')
    periods = laps.cumsum().apply(lambda x: 'period_{}'.format(x+1))
    return periods

# create period names
f['period'] = f.groupby(['message','name','userID'])['#timestamp'].transform(check)

# group by and count
(f.groupby(['message','name', 'userID', 'period'])
   .agg({'#timestamp': 'first', 'period': 'count'})
   .rename(columns={"period": "timediff"})
   .reset_index())
Output:
   message                            name                      userID  period    #timestamp           timediff
0  Failed to download file            Failed to download file   admin   period_1  2021-07-13 21:36:18  1
1  Successful Logon for user "user1"  Logon Attempt             1       period_1  2021-07-14 03:46:16  3
2  Successful Logon for user "user1"  Logon Attempt             1       period_2  2021-07-14 04:55:16  1
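If the goal is a CSV shaped like the asker's keep_col list, the final groupby above can also be written with named aggregation so the count column comes out directly as event_count (a small follow-up sketch, assuming f already carries the period column created by check; first_seen is just a temporary column name):
out = (f.groupby(['message', 'name', 'userID', 'period'])
         .agg(event_count=('#timestamp', 'count'),
              first_seen=('#timestamp', 'first'))
         .reset_index()
         .rename(columns={'first_seen': '#timestamp'}))
# Keep only the columns the original script intended to write out
out[['#timestamp', 'message', 'name', 'userID', 'event_count']].to_csv(
    'aggregationtest.csv', index=False)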

Related

How to correctly update a map field in firestore using python?

I have a map in firestore and I want to regularly update it (not overwrite previous keys). Most of the time it works; however, sometimes it fails without throwing any exception. The only indication that something went wrong is that the result (https://cloud.google.com/firestore/docs/reference/rest/v1/WriteResult) has an update_time which I can compare to now(); if the difference is too large, I know it did not perform an update just now. The problem is that after that the whole map is missing (all previous keys are gone). So not only did it fail to add the current keys, but somehow it wiped out the whole field.
Below is the full code:
error_keys = []
for key in data.keys():
    # continue if set is empty
    if not data[key]:
        continue
    try:
        new_keys = {
            f'myMap.{k}': v for k, v in data[key].items()}
        result = self.db.collection(u'myCollection').document(key).update(
            new_keys
        )
        now = datetime.now(tz=pytz.utc)
        dt_string = now.strftime("%d/%m/%Y %H:%M:%S.%fZ")
        duration = now - result.update_time
        duration_in_s = duration.total_seconds()
        minutes = divmod(duration_in_s, 60)[0]
        if minutes > 1.0:
            logger.warning("Diff to update time is larger that 1 min")
            logger.info(f'Now: {dt_string}')
            logger.info(f'Duration in minutes: {minutes}')
            logger.info(f'Adding {key} to error_keys')
            error_keys.append(key)
        logger.info(f'KEY: {key}: {data[key]} Update_time: {result.update_time} Diff_minutes: {minutes}')
    except:
        logger.warning(
            f'Could not key: {key} with data {data[key]} to firebase.')
        error_keys.append(key)
        logger.exception('Exception writing keys')
logger.info(f'ERROR_KEYS: {error_keys}')
return error_keys
I am using:
google-cloud-firestore 2.1.0
Python 3.7.3
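For reference, the dot-notation update that the snippet relies on looks like this in isolation (a minimal sketch; the collection name, document ID, and map keys are placeholders, and firestore.Client() is assumed to pick up default credentials):
from google.cloud import firestore

db = firestore.Client()
doc_ref = db.collection(u'myCollection').document('some-doc-id')

# Dot-notation field paths update individual keys inside the 'myMap' map
# without overwriting the keys that are already stored in it.
doc_ref.update({
    'myMap.key1': 'value1',
    'myMap.key2': 'value2',
})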

How to get the list of followers from an Instagram account without getting banned?

I am trying to scrape all the followers of some particular Instagram accounts. I am using Python 3.8.3 and the latest version of the Instaloader library. The code I have written is given below:
# Import the required libraries:
import instaloader
import time
from random import randint

# Start time:
start = time.time()

# Create an instance of instaloader:
loader = instaloader.Instaloader()

# Credentials & target account:
user_id = USERID
password = PASSWORD
target = TARGET  # Account of which the list of followers need to be scraped;

# Login or load the session:
loader.login(user_id, password)

# Obtain the profile metadata of the target:
profile = instaloader.Profile.from_username(loader.context, target)

# Print the list of followers and save it in a text file:
try:
    # The list to store the collected user handles of the followers:
    followers_list = []
    # Variables used to apply pauses to slow down scraping:
    count = 0
    short_counter = 1
    short_pauser = randint(19, 24)
    long_counter = 1
    long_pauser = randint(4900, 5000)
    # Fetch the followers one by one:
    for follower in profile.get_followers():
        sleeper = randint(840, 1020)
        # Short pause for the process:
        if (short_counter % short_pauser == 0):
            short_counter = 0
            short_pauser = randint(19, 24)
            print('\nShort Pause.\n')
            time.sleep(1)
        # Long pause for the process:
        if (long_counter % long_pauser == 0):
            long_counter = 0
            long_pauser = randint(4900, 5000)
            print('\nLong pause.\n')
            time.sleep(sleeper)
        # Append the list and print the follower's user handle:
        followers_list.append(follower.username)
        print(count, '', followers_list[count])
        # Increment the counters accordingly:
        count = count + 1
        short_counter = short_counter + 1
        long_counter = long_counter + 1
    # Store the followers list in a txt file:
    txt_file = target + '.txt'
    with open(txt_file, 'a+') as f:
        for the_follower in followers_list:
            f.write(the_follower)
            f.write('\n')
except Exception as e:
    print(e)

# End time:
end = time.time()
total_time = end - start

# Print the time taken for execution:
print('Time taken for complete execution:', total_time, 's.')
I am getting the following error after scraping some data:
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
400 Bad Request
In fact, the error occurs when Instagram detects unusual activity, disables the account for a while, and prompts the user to change the password.
I have tried -
(1) Slowing down the process of scraping.
(2) Adding pauses in between in order to make the program more human-like.
Still, no progress.
How can I bypass such restrictions and get the complete list of all the followers?
If getting the entire list is not possible, what is the best way to get a list of at least 20,000 followers (from multiple accounts) without getting banned, having the account disabled, or facing similar inconveniences?
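One thing the '# Login or load the session:' comment in the snippet hints at but never shows is reusing a saved session instead of logging in on every run. A minimal sketch, assuming Instaloader's standard session helpers (USERID and PASSWORD are the same placeholders as above):
import instaloader

loader = instaloader.Instaloader()
try:
    # Reuse a session saved by a previous run instead of logging in again
    loader.load_session_from_file(USERID)
except FileNotFoundError:
    loader.login(USERID, PASSWORD)
    loader.save_session_to_file()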

Dump series back into InfluxDB after querying with replaced field value

Scenario
I want to send data to an MQTT Broker (Cloud) by querying measurements from InfluxDB.
I have a field in the schema which is called status. It can either be 1 or 0. status=0 indicates that the series has not been sent to the cloud. If I get an acknowledgement from the MQTT Broker, then I wish to write the queried points back into the database with status=1.
As mentioned in the InfluxDB FAQs regarding duplicate data, if a point has the same timestamp as an existing point but a different field value, the updated field value will be shown.
In order to test this I created the following:
CREATE DATABASE dummy
USE dummy
INSERT meas_1,type=t1 status=0,value=123 1536157064275338300
query:
SELECT * FROM meas_1
provides
time                 status  type  value
1536157064275338300  0       t1    234
Now, if I want to overwrite the series, I do the following:
INSERT meas_1,type=t1 status=1,value=123 1536157064275338300
which will overwrite the series
time                 status  type  value
1536157064275338300  1       t1    234
(Note: this is not possible via Tags currently in InfluxDB)
Usage
1. Query some information using the client with "status"=0.
2. Restructure JSON to be sent to the cloud.
3. Send the information to the cloud.
4. If successful, then write the output from Step 1 back into the DB but with status=1.
I am using the InfluxDBClient (Python 3) to create the application (MQTT + InfluxDB).
Within the write_points API there is a parameter called batch_size which requires an int as input.
I am not sure how I can use this with the application that I want to build. Can someone guide me with this, or with the schema of the DB, so that I can upload actual and non-redundant information to the cloud?
The batch_size is actually the length of the list of measurements that needs to be passed to write_points.
Steps
Create client and query from measurement (here, we query gps information)
client = InfluxDBClient(database='dummy')
op = client.query('SELECT * FROM gps WHERE "status"=0', epoch='ns')
Make the ResultSet into a list:
batch = list(op.get_points('gps'))
Create an empty list for the updated points:
updated_batch = []
Parse through each measurement and change the status flag to 1. Note that field values in InfluxDB default to float:
for each in batch:
    new_mes = {
        'measurement': 'gps',
        'tags': {
            'type': 'gps'
        },
        'time': each['time'],
        'fields': {
            'lat': float(each['lat']),
            'lon': float(each['lon']),
            'alt': float(each['alt']),
            'status': float(1)
        }
    }
    updated_batch.append(new_mes)
Finally, dump the points back via the client with batch_size set to the length of updated_batch:
client.write_points(updated_batch, batch_size=len(updated_batch))
This overwrites the series because it contains the same timestamps, with the status field set to 1.
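To confirm the overwrite took effect, the same client can be queried again (a small follow-up sketch reusing the measurement and field names from the steps above):
# After write_points, the same timestamps should come back with status=1
check = client.query('SELECT * FROM gps WHERE "status"=1', epoch='ns')
print(len(list(check.get_points('gps'))))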

check availability of meeting rooms in outlook with python

I am working on a python script to check if a particular meeting room is available. If yes, then the meeting room will be booked; if not, then the script will find an available time slot for that day.
For now, I have managed to book a meeting room, but I am not able to check the availability of rooms.
To book a meeting room, I have to send a mail to the mail id configured for that meeting room, and I receive a corresponding acceptance/decline mail depending on availability.
Below is the snippet:
import win32com.client
import datetime
import pywintypes
oOutlook = win32com.client.Dispatch("Outlook.Application")
appt = oOutlook.CreateItem(1)
appt.Start = '2018-05-18 13:30'
appt.Subject = 'Follow Up Meeting'
appt.Duration = 30
appt.Location = '<name of meeting room>'
appt.MeetingStatus = 1
myRecipient = appt.Recipients.Add("<mail id of meeting room>")
myRecipient.resolve
my_date = datetime.date(2018,5,18)
pywintypeDate = pywintypes.Time (my_date)
availabilityInfo = myRecipient.FreeBusy(pywintypeDate,30,True)
print(availabilityInfo)
# appt.Save()
# appt.Send()
# print("done")
The output is:
000000000000000000000222222200222222022000000000000000000000000000000002222222222220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022000000000000000000000000000000000000000002220002222200000000000000000000000000000000002220022022222000000000000000000000000000000000000000000002222000000000000000000000000000000000000220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022000000000000000000000000000000000000000000000222222200000000000000000000000000000000002220000022000000000000000000000000000000000000002220000222222000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022000000000000000000000000000000000000000002220022022200000000000000000000000000000000000022000022000000000000000000000000000000000000000000000002222000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000022000000000000000000000000000000000000000000000022022200000000000000000000000000000000002220002222000000000000000
So does the first byte (0) indicate the time slot from 00:00 to 00:30, and so on for one complete month?
Is it possible to get output for only one day?
Do I have to parse the above output to check availability for my particular required time?
appt.Recipients.Add returns the Recipient object. Resolve it first (Recipient.Resolve), then call Recipient.FreeBusy.
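A minimal sketch of that sequence, assuming the standard Outlook COM object model exposed through win32com (the meeting room address is a placeholder):
import datetime
import pywintypes
import win32com.client

outlook = win32com.client.Dispatch("Outlook.Application")
appt = outlook.CreateItem(1)  # 1 = olAppointmentItem

recipient = appt.Recipients.Add("<mail id of meeting room>")
if recipient.Resolve():  # Resolve() must be called before FreeBusy
    start = pywintypes.Time(datetime.date(2018, 5, 18))
    # One character per 30-minute slot, starting at midnight of the given date
    free_busy = recipient.FreeBusy(start, 30, True)
    # The first 48 characters therefore cover the first day (48 * 30 min = 24 h)
    print(free_busy[:48])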

Python - Error querying Solarwinds N-Central via SOAP

I'm using python 3 to write a script that generates a customer report for Solarwinds N-Central. The script uses SOAP to query N-Central and I'm using zeep for this project. While not new to python I am new to SOAP.
When calling the CustomerList function I'm getting: TypeError: __init__() got an unexpected keyword argument 'listSOs'
import zeep
wsdl = 'http://' + <server url> + '/dms/services/ServerEI?wsdl'
client = zeep.CachingClient(wsdl=wsdl)
config = {'listSOs': 'true'}
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass, Settings=config)
Per the parameters below, 'listSOs' is not only a valid keyword, it's the only one accepted.
CustomerList
public com.nable.nobj.ei.Customer[] CustomerList(String username, String password, com.nable.nobj.ei.T_KeyPair[] settings) throws RemoteException
Parameters:
username - MSP N-central username
password - Corresponding MSP N-central password
settings - A list of non default settings stored in a T_KeyPair[]. Below is a list of the acceptable Keys and Values. If not used leave null
(Key) listSOs - (Value) "true" or "false". If true, only SOs will be shown; if false, only customers and sites will be shown. Default value is false.
I've also tried passing the dictionary as part of a list:
config = []
key = {'listSOs': 'true'}
config += key
TypeError: Any element received object of type 'str', expected lxml.etree._Element or builtins.dict or zeep.objects.T_KeyPair
Omitting the Settings value entirely:
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass)
zeep.exceptions.ValidationError: Missing element Settings (CustomerList.Settings)
And trying zeep's SkipValue:
customers = client.service.CustomerList(Username=nc_user, Password=nc_pass, Settings=zeep.xsd.SkipValue)
zeep.exceptions.Fault: java.lang.NullPointerException
I'm probably missing something simple, but I've been banging my head against the wall off and on over this for a while. I'm hoping someone can point me in the right direction.
Here's the source code from my getAssets.py script. I did it in Python 2.7, but it's easily upgradeable. Hope it helps someone else; N-central's API documentation is really bad lol.
# pip2.7 install zeep
import zeep, sys, csv, copy
from zeep import helpers

api_username = 'your_ncentral_api_user'
api_password = 'your_ncentral_api_user_pw'
wsdl = 'https://(yourdomain|tenant)/dms2/services2/ServerEI2?wsdl'
client = zeep.CachingClient(wsdl=wsdl)
response = client.service.deviceList(
    username=api_username,
    password=api_password,
    settings={
        'key': 'customerId',
        'value': 1
    }
)

# If you can't tell yet, I code sloppy
devices_list = []
device_dict = {}
dev_inc = 0
max_dict_keys = 0
final_keys = []

for device in response:
    # Iterate through all device nodes
    for device_properties in device.items:
        # Iterate through each device's properties and add it to a dict (keyed array)
        device_dict[device_properties.first] = device_properties.second
    # Dig further into device properties
    device_properties = client.service.devicePropertyList(
        username=api_username,
        password=api_password,
        deviceIDs=device_dict['device.deviceid'],
        reverseOrder=False
    )
    prop_ind = 0  # This is a hacky thing I did to make my CSV writing work
    for device_node in device_properties:
        for prop_tree in device_node.properties:
            for key, value in helpers.serialize_object(prop_tree).items():
                prop_ind += 1
                device_dict["prop" + str(prop_ind) + "_" + str(key)] = str(value)
    # Append the dict to a list (array), giving us a multi dimensional array; you need to do deep copy, as .copy will act like a pointer
    devices_list.append(copy.deepcopy(device_dict))
    # check to see the amount of keys in the last item
    if len(devices_list[-1].keys()) > max_dict_keys:
        max_dict_keys = len(devices_list[-1].keys())
        final_keys = devices_list[-1].keys()

print "Gathered all the datas of N-central devices count: ", len(devices_list)

# Write the data out to a CSV
with open('output.csv', 'w') as csvfile:
    fieldnames = final_keys
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for csv_line in devices_list:
        writer.writerow(csv_line)
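Following the same key/value pattern the deviceList call above uses for its settings, the original CustomerList call would presumably pass listSOs the same way (a hedged sketch, not verified against the ServerEI WSDL; the server URL and credentials are placeholders):
import zeep

nc_user = 'your_ncentral_user'          # placeholder
nc_pass = 'your_ncentral_password'      # placeholder
wsdl = 'http://your-ncentral-server/dms/services/ServerEI?wsdl'  # placeholder URL
client = zeep.CachingClient(wsdl=wsdl)

# T_KeyPair settings are passed as {'key': ..., 'value': ...} dicts,
# not as a plain {'listSOs': 'true'} mapping
customers = client.service.CustomerList(
    Username=nc_user,
    Password=nc_pass,
    Settings={'key': 'listSOs', 'value': 'true'},
)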
