After printing my data to JSON I realized my API call never finishes, so how can I create a countdown to stop it? I am paginating with a cursor value that never really changes, and I cannot use the rel links. If I can get totalPages from the response, can I count down based on that variable, or is there a better way to end the Python API requests?
My current approach doesn't really work because there is always a cursor, so the while loop never ends.
I thought I could do count = totalpage - 1, but that isn't working either. This is what I have so far:
import requests, sys
import urllib3
from pathlib import Path
import json
from datetime import datetime
from requests import Session
from typing import Any, List, Tuple, Dict
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
from collections import Counter
# disable urllib3 warnings for SSL
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
sys.path.append('/Users/s153152/Documents/Workspace/kpi/rapid7')
def _headers():
    headers = {
        'X-Api-Key': 'key',
        'Content-type': 'application/json',
        'Accept': '*/*'
    }
    return headers

def _request():
    third_party_patching_filer = {
        "asset": "asset.tags IN ['osswin'] && asset.os.vendor CONTAINS 'microsoft' && asset.agentKey IS NOT NULL",
        "vulnerability": "vulnerability.categories NOT IN ['microsoft patch']"}
    headers = _headers()
    print(headers)
    url1 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=50"
    resp = requests.post(url=url1, headers=headers, json=third_party_patching_filer, verify=False).json()
    has_next_cursor = True
    nextKey = ""
    totalpage = resp["metadata"]
    print(totalpage)
    total_server_ip_addresses = []
    osname_server1 = []
    total_critical_vul_osswin = []
    results = []
    with requests.Session() as session:
        url2 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=50&cursor={nextKey}"
        while has_next_cursor:
            s = requests.Session()
            backoff_factor = 0.3
            retries = Retry(total=10, backoff_factor=backoff_factor, status_forcelist=[502, 503, 504])
            s.mount(url2, HTTPAdapter(max_retries=retries))
            s = s.post(url=url2, headers=headers, json=third_party_patching_filer, verify=False)
            print(s.status_code)
            s = s.json()
            rel_last = s["metadata"]
            cursor = s["metadata"]
            if "cursor" in cursor:
                nextKey = cursor["cursor"]
                totalResources = cursor["totalResources"]
                count = totalpage - 1
                print(f"next key {nextKey}, total resources {totalResources}")
                print(f"{count}")
                #print(desktop_support)
                for data in s["data"]:
                    for tags in data['tags']:
                        if tags["name"] == 'OSSWIN':
                            try:
                                osname_server = data['host_name']
                                server_host_ip_address = data['ip']
                                critical_vuln_osswin = data['critical_vulnerabilities']
                                os_type = data['os_type']
                                data = {
                                    'ip_address': server_host_ip_address,
                                    'os_name': osname_server,
                                    'critical_vuln': critical_vuln_osswin,
                                    'os_type': os_type
                                }
                                results.append(data)
                            except Exception as e:
                                print(f"Possibly no ip address, error {e}")
                #print(f"Server asset: {osname_server1}, {total_server_ip_addresses}")
                with open('server_info.json', 'w') as f:
                    json.dump(results, f, indent=2)
                #print(data['host_name'])
            else:
                has_next_cursor = False
    return total_server_ip_addresses, sum(total_server_ip_addresses)

def smart_filter():
    test = _request()
    print(test)

smart_filter()
In short: how do I count down from a variable (or otherwise detect the last page) so the API calls actually finish?
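One way to make the loop terminate, sketched below, is to derive the number of pages from the first response and count down, stopping when the counter runs out or the API stops returning a cursor. This is only a sketch: the metadata and cursor keys come from the code above, while the totalPages key is an assumption, so swap in whatever your actual payload calls it.

import requests

def fetch_all_assets(url, headers, body):
    # Page through the API until no cursor comes back or the page countdown is exhausted.
    results = []
    cursor = None
    pages_left = None  # derived from the first response's metadata

    while True:
        params = {"size": 50}
        if cursor:
            params["cursor"] = cursor
        resp = requests.post(url, headers=headers, json=body, params=params, verify=False).json()

        results.extend(resp.get("data", []))
        meta = resp.get("metadata", {})

        if pages_left is None:
            # assumption: metadata carries a total page count under "totalPages"
            pages_left = meta.get("totalPages", 0)

        cursor = meta.get("cursor")
        pages_left -= 1

        # stop when no cursor is returned or the countdown reaches zero
        if not cursor or pages_left <= 0:
            break

    return results

The countdown is really a safety net; the cleaner exit condition is the absence of a cursor in the metadata, which is what the else branch in the code above already tries to use. Note also that url2 is built once with an empty cursor and never updated inside the loop, so every iteration requests the same first page.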
Related
I have created a simple threaded request script:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]

def doScrape(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    return {
        'url': response.url,
        'repository_results': t.text.strip()
    }

def doRequest(url):
    response = requests.get(url)
    time.sleep(random.randint(1, 3))
    return response

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url
            ) for url in URLS]
        for future in as_completed(future_tasks):
            response = future.result()
            if response.status_code == 200:
                result = doScrape(response)
                print(result)

while True:
    t = threading.Thread(target=ourLoop, )
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
Here I start a thread that runs a ThreadPoolExecutor with 2 workers. The idea is to monitor 24/7 and notify myself whenever there has been a change (in this case, whenever repository_results differs between the previous request and the latest one); whenever there is a change, I want to print a message saying there is a difference.
How can I do that with ThreadPoolExecutor, and how can I monitor a specific URL to see whether a change has happened or not?
You can do this by storing the previous results in the list itself and passing them, along with the response, to doScrape:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    result = {'url': url_dict['url'], 'repository_results': None, 'change': False}
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current result, set 'change' to True; the only
    # exception is when the previous result was None, i.e. this is the first time we are running this
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        result['change'] = True
    result['repository_results'] = current_response
    return result

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                result = doScrape(response, url_dict)
                print(result)
                if result['change']:
                    print(f'Changed for url : {result["url"]}!')

while True:
    t = threading.Thread(target=ourLoop, )
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
The only case where this fails is if the change happens the very first time you run the loop, since we do not know the previous value of the scraped element.
Also, if you are planning to run this in a loop and only want to print when there is a change, make sure to update the repository_results key in the url_dict itself (inside doScrape); you can omit the return line as well:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current result, print a change message; the only
    # exception is when the previous result was None, i.e. this is the first time we are running this
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        print(f'Changed for url : {url_dict["url"]}')
    url_dict['repository_results'] = current_response

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                doScrape(response, url_dict)

while True:
    t = threading.Thread(target=ourLoop, )
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
In this code I want to extract article content from newspaper links using BeautifulSoup, but it is not working properly. Each link in the list "filtered_Final_LIST" points to a page with multiple articles, and the function ext_url does not return the results for all pages when I use the concurrent library.
A normal for loop works properly. I used the concurrent library to speed up the extraction. Am I doing something wrong?
import concurrent.futures
import re
import time
import urllib.request

from bs4 import BeautifulSoup

MAX_THREADS = 30

filtered_Final_LIST = ['https://www.financialexpress.com/economy/finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/"',
                       'https://www.financialexpress.com/economy/uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/"',
                       'https://www.financialexpress.com/economy/economic-recovery-yet-to-attain-durability-says-report/2410690/"',
                       'https://www.financialexpress.com/economy/vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/"']

def ext_url(url):
    global List_articles, List_header, List_date, List_month, List_year, List_source

    ## Lists to get dates and news articles
    List_articles = []
    List_header = []
    List_date = []
    List_month = []
    List_year = []
    List_source = []

    # for i in range(len(filtered_Final_LIST)):
    # if 'https://www.financialexpress.com/economy/' in str(Final_LIST[i]):

    # opening the url for reading
    html = urllib.request.urlopen(url, timeout=10)
    print(url)

    # parsing the html file
    htmlParse = BeautifulSoup(html, 'html.parser')

    # getting all the paragraphs of articles
    for para in htmlParse.find_all(['div'], class_='entry-content wp-block-post-content'):
        List_articles.append(para.get_text())

    # Getting respective month, date, year the article published
    from datetime import datetime
    date = htmlParse.find(itemprop="article:published_time").get("content")
    match = re.search(r'\d{4}-\d{2}-\d{2}', date)
    dt = datetime.strptime(match.group(), '%Y-%m-%d').date()
    List_month.append(dt.month)
    List_date.append(dt.day)
    List_year.append(dt.year)

    # getting all the headings of articles
    for para in htmlParse.find_all(['h1'], class_='wp-block-post-title'):
        List_header.append(para.get_text())

    # getting all the source of articles
    for para in htmlParse.find_all(['div'], class_='author-link ie_custom_theme_multiple_authors'):
        List_source.append(para.get_text())

    return List_articles, List_header, List_date, List_month, List_year, List_source

with concurrent.futures.ThreadPoolExecutor() as executor:
    for i in range(len(filtered_Final_LIST)):
        executor.submit(ext_url, filtered_Final_LIST[i])
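A likely reason the threaded version appears to lose results is that ext_url reassigns module-level globals on every call, so concurrent calls overwrite each other, and the values returned by ext_url are never collected from the futures. Below is a minimal sketch of the collection side, assuming the global line is removed from ext_url so each call builds and returns its own lists (ext_url, filtered_Final_LIST and MAX_THREADS are reused from the snippet above):

import concurrent.futures

all_articles, all_headers, all_dates = [], [], []
all_months, all_years, all_sources = [], [], []

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    futures = [executor.submit(ext_url, url) for url in filtered_Final_LIST]
    for future in concurrent.futures.as_completed(futures):
        # each call returns its own six lists; merge them here, in the main thread
        articles, headers, dates, months, years, sources = future.result()
        all_articles.extend(articles)
        all_headers.extend(headers)
        all_dates.extend(dates)
        all_months.extend(months)
        all_years.extend(years)
        all_sources.extend(sources)

The answer below takes a different route and rewrites the scraper with trio and httpx: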
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
# pip install trio httpx

mainurl = 'https://www.financialexpress.com/economy/'
news = [
    'finmin-asks-ministries-to-restrict-expenses-within-prescribed-limit/2410766/',
    'uk-inflation-hits-near-30-year-high-pressuring-boe-and-households/2410761/',
    'economic-recovery-yet-to-attain-durability-says-report/2410690/',
    'vagaries-of-weather-drive-near-13-lakh-maha-farmers-to-crop-insurance-scheme/2410030/'
]

allin = []

async def get_soup(content):
    return BeautifulSoup(content, 'lxml')

async def worker(receiver):
    async with receiver:
        async for client, new in receiver:
            r = await client.get(mainurl + new)
            soup = await get_soup(r.text)
            prs = [x.text for x in soup.select(
                '.entry-content > p:not(:last-child)')]
            title = soup.select_one('.wp-block-post-title').text
            author = soup.select_one('div.author-link a').text
            publish = soup.select_one(
                '[itemprop="article:published_time"]')['content'].split('T')[0].split('-')
            target = [title, author, *publish, prs]
            allin.append(target)

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(5):
                nurse.start_soon(worker, receiver.clone())
        async with sender:
            for new in news:
                await sender.send([client, new])

if __name__ == "__main__":
    trio.run(main)
    df = pd.DataFrame(
        allin, columns=['Title', 'Author', 'Year', 'Month', 'Day', 'Paragraphs'])
    print(df)
    df.to_csv('data.csv', index=False)
import time

from bs4 import BeautifulSoup
import requests
from urllib.request import Request, urlopen

pages = ["movies", "series"]
printed = []

for page in pages:
    req = Request("https://www.thenetnaija.com/videos/" + page, headers={'User-Agent': 'XYZ/3.0'})
    webpage = urlopen(req, timeout=10)
    b4 = BeautifulSoup(webpage, "html.parser")
    movie_list = b4.find_all("div", {"class": "video-files"})
    for allContainers in movie_list:
        filmName = allContainers.find('img').get('alt')
        printed.append(filmName)
        print(printed)

for get in printed:
    requests.get("https://api.telegram.org/bot:AAEapVykIXdphGYaH5ZjXuhpFaFw7wpi5Bs/sendMessage?chat_id=&text={}".format(get))
I want to use a while loop to let the program run infinitely and only send the requests to my telegram chat if the data in the list has changed.
You can use this example as a basis for checking the movies/series periodically (it uses set.difference to determine whether there are changes):
import time

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

def get_movies(url):
    headers = {"User-Agent": "XYZ/3.0"}
    req = Request(url, headers=headers)
    b4 = BeautifulSoup(urlopen(req, timeout=10), "html.parser")
    return set(a.get_text(strip=True) for a in b4.select("h2 a"))

url = "https://www.thenetnaija.com/videos/{}"
pages = {
    "movies": get_movies(url.format("movies")),
    "series": get_movies(url.format("series")),
}

while True:
    time.sleep(10)  # <-- sleep 10sec before checking again

    for k, v in pages.items():
        new_movies = get_movies(url.format(k))
        difference = new_movies.difference(v)
        if difference:
            print("New {}:".format(k))
            print(difference)
            pages[k] = new_movies
            # do stuff here (post to telegram etc.)
            # ...
        else:
            print("No new {}".format(k))
I am working on a PubMed project where I need to extract the IDs of free full text and free PMC articles. This is my code:
import requests
from bs4 import BeautifulSoup
from Bio import Entrez

Entrez.email = "abc#gmail.com"  # Always tell NCBI who you are
handle = Entrez.esearch(db="pubmed", term="cough")
record = Entrez.read(handle)
count = record['Count']
handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
record = Entrez.read(handle)

free_article_ids = []
for id_ in record['IdList']:
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is None:
        continue
    elif status.text in ["Free full text", "Free PMC Article"]:
        free_article_ids.append(id_)
print(free_article_ids)
The problem is that it takes far too long to produce the result, and I want to speed the process up. How do I do it?
Use multithreading to download concurrently. I recommend a simple framework.
from Bio import Entrez
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain

class MySpider(Spider):
    name = 'ncbi.nlm.nih.gov'
    start_urls = []

    def __init__(self):
        Entrez.email = "abc#gmail.com"  # Always tell NCBI who you are
        handle = Entrez.esearch(db="pubmed", term="cough")
        record = Entrez.read(handle)
        count = record['Count']
        handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
        record = Entrez.read(handle)
        for id_ in record['IdList']:
            self.start_urls.append(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
        Spider.__init__(self, self.name)  # necessary

    free_article_ids = []

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        status = doc.select('span.status_icon')
        if status and status.text in ["Free full text", "Free PMC Article"]:
            id = url.split('/')[-1]
            self.free_article_ids.append(id)
            return {"Urls": [], "Data": {"id": id}}
        return True

SimplifiedMain.startThread(MySpider())
Here are more examples. https://github.com/yiyedata/simplified-scrapy-demo
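If you would rather stay with the requests/BeautifulSoup code from the question, the same idea (download the pages concurrently) can be sketched with the standard library's ThreadPoolExecutor. The status_icon selector and record['IdList'] are taken from the question's code, and max_workers=20 is an arbitrary choice:

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

def check_id(id_):
    # fetch one article page and report whether it is marked free full text / free PMC
    resp = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}", timeout=10)
    soup = BeautifulSoup(resp.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is not None and status.text in ["Free full text", "Free PMC Article"]:
        return id_
    return None

free_article_ids = []
with ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(check_id, id_) for id_ in record['IdList']]
    for future in as_completed(futures):
        id_ = future.result()
        if id_ is not None:
            free_article_ids.append(id_)

print(free_article_ids)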
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool

s = Session()  # shared session used by get_data_from_page

def get_total_pages():
    tut = []
    base_url = 'Your group '
    for url in [base_url % i for i in range(1, 27)]:
        tut.append(url)
    print(tut)
    #get_data_from_page(tut)
    pool = ThreadPool(8)
    results = pool.map(get_data_from_page, tut)

def get_data_from_page(tut):
    f = open("emails.txt", 'a')
    email = []
    for a in tut:
        link = s.get(a).text
        soup = BeautifulSoup(link, 'lxml')
        links = soup.find('div', class_="mens").find_all('span', class_="inviz")
        for e in links:
            emails = e.text
            f.write(emails + ', ')
            email.append(emails)
    print(email)

def main():
    get_total_pages()

if __name__ == '__main__':
    main()
Running this produces an error saying it only works with multiprocessing, followed by:
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The problem was in this:

    for a in tut:
        link = s.get(a).text

pool.map already passes one URL at a time, so this loop iterates over the characters of the URL string, which is where the Invalid URL 'h' error comes from. What was needed was just:

    link = s.get(tut).text  # without the for loop
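Put together, a corrected get_data_from_page might look like the sketch below. pool.map hands the function a single URL, so there is nothing to loop over, and each call opens its own session; the div.mens / span.inviz selectors are the ones from the question:

import requests
from bs4 import BeautifulSoup

def get_data_from_page(url):
    # pool.map passes one URL string per call, so no inner for loop is needed
    session = requests.Session()
    html = session.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    spans = soup.find('div', class_="mens").find_all('span', class_="inviz")
    emails = [e.text for e in spans]
    with open("emails.txt", 'a') as f:
        f.write(', '.join(emails) + ', ')
    print(emails)
    return emails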