Login timeout while scraping - python-3.x

While I'm scraping boxrec (www.boxrec.com) using BeautifulSoup and the urllib library, my login times out and the process stops. I have to log out and log back in manually to resume the process.
Libraries I'm using:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import http.client
import csv

# This is the final function that is getting interrupted
def AllDataWriter(rankingURL, nfileNameT):
    uClientRanking = uReq(rankingURL)
    try:
        rankingURL_html = uClientRanking.read()
    except http.client.IncompleteRead as e:
        rankingURL_html = e.partial
    rankingURL_soup = soup(rankingURL_html, 'html.parser')
    rankingFighters = rankingURL_soup.findAll('a', {'class': 'personLink'})

    with open(nfileNameT, mode='w', encoding="utf-8", newline='') as csv_file:
        f = csv.writer(csv_file)
        f.writerow(dataBaseHeader)
        for item in rankingFighters:
            thisURL = 'http://boxrec.com' + item['href']
            fighterArray = getFighterData(thisURL)
            # each fighter contributes rows of 38 fields
            for d in range(0, int(len(fighterArray) / 38)):
                u = d * 38
                f.writerow(fighterArray[u:u + 38])
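One way to keep a long run going is to detect when the session has expired and log in again before retrying. The following is only a rough sketch, not code from the question: it swaps urllib for requests.Session so cookies persist, and the login URL and form field names are placeholders that would have to be checked against boxrec's real login form.

import requests
from bs4 import BeautifulSoup as soup

session = requests.Session()

def login(username, password):
    # Hypothetical endpoint and field names -- verify against the actual login page.
    session.post('https://boxrec.com/en/login',
                 data={'_username': username, '_password': password})

def fetch_soup(url, username, password):
    html = session.get(url).text
    # Crude heuristic: if the response looks like the login screen,
    # the session has expired, so log in again and retry once.
    if 'password' in html.lower() and 'login' in html.lower():
        login(username, password)
        html = session.get(url).text
    return soup(html, 'html.parser')

With something like this, AllDataWriter could call fetch_soup(thisURL, ...) instead of uReq(thisURL) and carry on after a timeout instead of stopping.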

Related

How to create a while loop that continuously detects if scraped data in a list changes

import time
from bs4 import BeautifulSoup
import requests
from urllib.request import Request, urlopen

pages = ["movies", "series"]
printed = []

for page in pages:
    req = Request("https://www.thenetnaija.com/videos/" + page, headers={'User-Agent': 'XYZ/3.0'})
    webpage = urlopen(req, timeout=10)
    b4 = BeautifulSoup(webpage, "html.parser")
    movie_list = b4.find_all("div", {"class": "video-files"})
    for allContainers in movie_list:
        filmName = allContainers.find('img').get('alt')
        printed.append(filmName)

print(printed)

for get in printed:
    requests.get("https://api.telegram.org/bot:AAEapVykIXdphGYaH5ZjXuhpFaFw7wpi5Bs/sendMessage?chat_id=&text={}".format(get))
I want to use a while loop to let the program run infinitely and only send the requests to my telegram chat if the data in the list has changed.
You can use this example as a basis for how to check the movies/series periodically (it uses set.difference to determine whether there are changes):
import time
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

def get_movies(url):
    headers = {"User-Agent": "XYZ/3.0"}
    req = Request(url, headers=headers)
    b4 = BeautifulSoup(urlopen(req, timeout=10), "html.parser")
    return set(a.get_text(strip=True) for a in b4.select("h2 a"))

url = "https://www.thenetnaija.com/videos/{}"

pages = {
    "movies": get_movies(url.format("movies")),
    "series": get_movies(url.format("series")),
}

while True:
    time.sleep(10)  # <-- sleep 10 sec before checking again
    for k, v in pages.items():
        new_movies = get_movies(url.format(k))
        difference = new_movies.difference(v)
        if difference:
            print("New {}:".format(k))
            print(difference)
            pages[k] = new_movies
            # do stuff here (post to telegram etc.)
            # ...
        else:
            print("No new {}".format(k))

How to scrape if the whole page changes?

With this I could still reach the link 3 days ago:
import requests
from bs4 import BeautifulSoup

html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
    FIXTV = vid['src']
Now the page returns only a single piece of text:
No listen URL! SID not found!
What can be done in this case? It wouldn't bother me so much if only that link stopped working, but when printing, a single "broken" link among the links makes the whole thing fail. In the example below, CINELIFEHD works; as soon as I add FIXTV, the print no longer works because of the changed page.
import requests
from bs4 import BeautifulSoup

html_url55 = "http://streamstat.net/videoplayer.cgi?sid=14358315&ext=.m3u8"
html_response = requests.get(html_url55)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
    CINELIFEHD = vid['src']

html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
    FIXTV = vid['src']

print(
    "#EXTM3U"
    + '\n' + "#EXTINF:0,tvg-logo=https://cinelife.com/wp-content/uploads/2020/04/cinelife_logo.png, CINE LIFE HD" + '\n' +
    CINELIFEHD
    + '\n' + "#EXTINF:0,tvg-logo=http://1241.hu/userfiles/image/tvcsatornak/pic_atkoto_55_fix_tv.png, Fix" + '\n' +
    FIXTV
)
from bs4 import BeautifulSoup
import requests

keys = [148177550, 14358315]

params = {
    'ext': '.m3u8'
}

def get_soup(content):
    return BeautifulSoup(content, 'lxml')

def main(url):
    with requests.Session() as req:
        for k in keys:
            params['sid'] = k
            r = req.get(url, params=params)
            soup = get_soup(r.text)
            try:
                goal = soup.select_one('source')['src']
            except TypeError:
                goal = "N/A"
            print("Key: {:10}, Result: {}".format(k, goal))

main('http://streamstat.net/videoplayer.cgi')
Output:
Key: 148177550, Result: N/A
Key: 14358315, Result: https://magselect-stirr.amagi.tv/playlist1080p.m3u8
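Building on that fallback, a minimal sketch of how the playlist print could tolerate a broken SID is shown below. The channel-to-SID mapping comes from the question's two URLs; the tvg-logo attributes are omitted here for brevity, and this is only one possible way to assemble the output, not part of the answer above.

import requests
from bs4 import BeautifulSoup

channels = {
    "CINE LIFE HD": 14358315,
    "Fix": 148177550,
}

lines = ["#EXTM3U"]
with requests.Session() as session:
    for name, sid in channels.items():
        r = session.get("http://streamstat.net/videoplayer.cgi",
                        params={"sid": sid, "ext": ".m3u8"})
        tag = BeautifulSoup(r.text, "html.parser").select_one("source")
        src = tag["src"] if tag is not None else "N/A"
        # Only emit a playlist entry for streams that actually resolved.
        if src != "N/A":
            lines.append("#EXTINF:0, {}".format(name))
            lines.append(src)

print("\n".join(lines))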

I need to scrape the job description text for every job title in the mentioned page

I need to scrape the job descriptions on the page () for every job title, e.g. section (accounting), job title (staff accountant), and the job description text under that title, into different columns of a CSV file, using Python's Beautiful Soup module.
I'm new to Beautiful Soup. I tried a few ways of doing it but it's not working; can you please help with the code?
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

start = time.time()

url = ""
data = []

while True:
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    jobdesc = soup.find("li", {'class': 'col-xs-12 col-sm-4'})
    section = soup.find("h4")
    jd = {"jobdescription": jobdesc.text, "topic": section.text}
    data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
Here is one way, leveraging :has in bs4 4.7.1+ to isolate the sections for looping over. zip_longest is used so we can join the section title onto each job.
import requests, csv
from bs4 import BeautifulSoup as bs
from itertools import zip_longest

r = requests.get('https://resources.workable.com/job-descriptions/#', headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')

with open("data.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Section', 'Job Title'])
    for section in soup.select('section:has(.job)'):
        title = section.select_one('a').text.strip()
        jobs = [job.text for job in section.select('li a')]
        rows = list(zip_longest([title], jobs, fillvalue=title))
        for row in rows:
            w.writerow(row)
I got a 403 Forbidden using the requests package, so I decided to use Selenium.
You can try this:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = soup.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        jd = {"jobdescription": li.text, "topic": title}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
EDIT: To get the description for all jobs:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        job = li.text
        driver.get(li.find('a').get('href'))
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        jd = {"job": job, "topic": title, "description": soup2.find('div', {'class': 'entry-content article-content'}).text}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
Scraping data from Monster jobs and uploading to MongoDB
from time import *
from selenium import webdriver
import pymongo
from pymongo.results import InsertManyResult
import os

client = pymongo.MongoClient()
mydb = client['jobs']
collection = mydb['med_title']

driver = webdriver.Chrome("C:/Users/91798/Desktop/pythn_files/chromedriver.exe")
driver.get("https://www.monsterindia.com/")
driver.implicitly_wait(9)
driver.find_element_by_id("SE_home_autocomplete").send_keys("nursing , Therapist , docter , medical ,nurse , hospital")
# for normal search use this
driver.find_element_by_xpath("//body/div[@id='themeDefault']/section[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/form[1]/div[1]/div[2]/input[1]").click()
driver.implicitly_wait(20)

temp = 1
while True:
    if temp == 5:
        break
    all_jobs = driver.find_elements_by_class_name("card-apply-content")
    link_list = []
    for job in all_jobs:
        try:
            company = ""
            com_name = job.find_elements_by_class_name("job-tittle")
            driver.implicitly_wait(1)
            for ele in com_name:
                company = ele.find_element_by_class_name('company-name').text
            job_title = ""
            for ele in com_name:
                job_title = ele.find_element_by_class_name('medium').text
            location = job.find_element_by_class_name("loc").text
            driver.implicitly_wait(1)
            lnks = job.find_elements_by_tag_name("a")
            for lnk in lnks:
                link_list.append(lnk.get_attribute('href'))
                break
            driver.implicitly_wait(1)
            desc = job.find_element_by_class_name("job-descrip").text
            driver.implicitly_wait(1)
            skills = job.find_element_by_class_name("descrip-skills").text
        except:
            desc = 'desc Not Specified'
            skills = 'skills Not Specified'
            location = ' location Not Specified'
            company = 'company Not Specified'
            job_title = 'job_title not specified'
        s = skills.split(' ')
        for i in s:
            if i == ',':
                s.remove(',')
        data = {"job_title": job_title, "comapany_name": company, "job_location": location,
                "job_desc": desc, "skills": s[2::], "card_link": link_list[0]}
        link_list.clear()
        y = collection.insert_one(data)
        print(y.inserted_id)
    driver.find_element_by_xpath("//button[contains(text(),'Next')]").click()
    sleep(25)
    temp = temp + 1
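One possible tweak to the pagination above (not from the original post) is to replace the fixed sleep(25) after clicking "Next" with an explicit wait, so the loop resumes as soon as the next page's cards are actually present. The sketch below assumes the same class names used above ("card-apply-content") and the same "Next" button XPath.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def go_to_next_page(driver, timeout=30):
    wait = WebDriverWait(driver, timeout)
    # Wait for the "Next" button to be clickable, click it, then wait for job cards to reload.
    next_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Next')]")))
    next_btn.click()
    wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "card-apply-content")))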

How Can I Loop Through URLs and import TD elements from several Links

I am trying to import data from the following URLs, and write each data set to a CSV file.
Here are a few sample URLs that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should, basically, look like this.
import csv
import urllib.request
from bs4 import BeautifulSoup

soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
for stocks in tckr:
    url_list = [url_base + tckr]
with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for url in url_list:
        try:
            fpage = urllib.request.urlopen(url)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except urllib.error.HTTPError:
            print("{} - not found".format(url))
Except that when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [url_base + s for s in tckr]

with open('../Python/SOtest.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for url in url_list:
        try:
            fpage = requests.get(url)
            fsoup = BeautifulSoup(fpage.content, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except HTTPError:
            print("{} - not found".format(url))
I use requests, so there is that difference, but it works, so you can pull code from there if need be.
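To get just the fields "from 'Index' to 'Change'" that the question asks about, a rough sketch is shown below. It assumes, as the code above does, that the 'snapshot-td2-cp' cells hold the labels and the 'snapshot-td2' cells hold the matching values, and pairs them into a dict so specific fields can be pulled out by name; the User-Agent header is an assumption to avoid being blocked.

import requests
from bs4 import BeautifulSoup

def snapshot_fields(ticker):
    # Fetch the quote page and pair label cells with value cells.
    r = requests.get("https://finviz.com/quote.ashx?t=" + ticker,
                     headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.content, "html.parser")
    labels = [td.text for td in soup.find_all("td", {"class": "snapshot-td2-cp"})]
    values = [td.text for td in soup.find_all("td", {"class": "snapshot-td2"})]
    return dict(zip(labels, values))

fields = snapshot_fields("AAPL")
print(fields.get("Index"), fields.get("Change"))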

Requests exception with only multiprocessing.dummy import Pool

import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool

def get_total_pages():
    tut = []
    base_url = 'Your group '
    for url in [base_url % i for i in range(1, 27)]:
        tut.append(url)
    print(tut)
    #get_data_from_page(tut)
    pool = ThreadPool(8)
    results = pool.map(get_data_from_page, tut)

def get_data_from_page(tut):
    f = open("emails.txt", 'a')
    email = []
    for a in tut:
        link = s.get(a).text
        soup = BeautifulSoup(link, 'lxml')
        links = soup.find('div', class_="mens").find_all('span', class_="inviz")
        for e in links:
            emails = e.text
            f.write(emails + ', ')
            email.append(emails)
        print(email)

def main():
    get_total_pages()

if __name__ == '__main__':
    main()
This results in an error that only shows up when using multiprocessing:
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The problem was here:

for a in tut:
    link = s.get(a).text

What was needed was just:

link = s.get(a).text  # without the for loop

pool.map already passes a single URL to get_data_from_page, so looping over it iterates over the characters of the string, which is why requests complains about the invalid URL 'h'.
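A minimal sketch of the corrected function under that fix is below. The session object s is assumed to be a requests.Session created at module level, which the original snippet references but never defines; everything else follows the original code.

import requests
from bs4 import BeautifulSoup

s = requests.Session()  # the original code calls s.get() but never creates s

def get_data_from_page(url):
    # pool.map passes one URL at a time, so no inner for loop is needed.
    emails = []
    link = s.get(url).text
    soup = BeautifulSoup(link, 'lxml')
    spans = soup.find('div', class_="mens").find_all('span', class_="inviz")
    with open("emails.txt", 'a') as f:
        for e in spans:
            f.write(e.text + ', ')
            emails.append(e.text)
    print(emails)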
