Keeping a session open with urllib in Python 3.6

I'm trying to log in to and scrape Tumblr. When you log into the website normally through a browser, the login has two steps: you enter your email first, the site checks whether an account is associated with that email, and only then can you enter your password. Unfortunately, this causes problems when trying to automate the login without the requests module (I'm using urllib.request and urllib.parse, which already ship with Python 3.6), because there is no explicit way to start a session, so I can't keep the same session between the email verification and submitting the password.
Do I need to use cookies to do this, or will I have to install the requests module? My code so far looks a bit like this:
import urllib.request
import urllib.parse
from html.parser import HTMLParser

input_tags = []

class myHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        # collect the value that follows the hidden "form_key" input on the login page
        if tag == "input":
            for i in range(len(attrs)):
                if attrs[i][0] == "name" and attrs[i][1] == "form_key":
                    input_tags.append(attrs[i + 1][1])

parser = myHTMLParser()
form_key = ""

def get_form_key():
    global form_key
    global input_tags
    url = "https://www.tumblr.com/login"
    req = urllib.request.Request(url)
    resp = urllib.request.urlopen(req)
    resp = resp.read()
    parser.feed(str(resp))
    print(input_tags)
    form_key = input_tags
    print("form key is : ", form_key)
    if len(form_key) > 1:
        form_key = form_key[:1]
        print("\nform key should be one value long now: ", form_key)

get_form_key()

headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"

url = "https://www.tumblr.com/login"
login_data = {
    "determine_email": "my.email#email.com",
    "user[email]": "my.email#email.com",
    "user[password]": "secretpassword",
    "tumblrlog[name]": "",
    "user[age]": "",
    "http_referer": "https://www.tumblr.com/logout",
    "form_key": form_key
}

# note: this originally referenced an undefined name `data`; it should be `login_data`
encoded_data = urllib.parse.urlencode(login_data)
encoded_data = encoded_data.encode("utf-8")
request = urllib.request.Request(url, headers=headers, data=encoded_data)
response = urllib.request.urlopen(request)
response_url = response.geturl()
print(response_url)
This prints out the form key twice (not that important, that's just from my bug checking) and then it returns the url:
https://www.tumblr.com/login
which indicates that the login was not successful.
Any idea how to fix this?
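One way to get a session without installing requests is to attach an http.cookiejar.CookieJar to an opener built with urllib.request.build_opener, and reuse that opener for both the GET that fetches the form_key and the login POST, so cookies set by the first response are sent back with the second. A minimal sketch, assuming the form field names and the form_key parsing from the question (they may not match what Tumblr actually expects):

import urllib.request
import urllib.parse
import http.cookiejar

# one CookieJar + one opener shared by every request acts as the "session"
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [("User-Agent", "Mozilla/5.0")]

login_url = "https://www.tumblr.com/login"

# first request: load the login page so the server sets its session cookies,
# then parse the form_key out of the HTML as in the question
login_page = opener.open(login_url).read().decode("utf-8")
form_key = "value parsed from login_page"  # hypothetical placeholder

# second request: post the login form through the SAME opener,
# so the cookies from the first response are sent back automatically
login_data = urllib.parse.urlencode({
    "determine_email": "my.email#email.com",
    "user[email]": "my.email#email.com",
    "user[password]": "secretpassword",
    "form_key": form_key,
}).encode("utf-8")

response = opener.open(login_url, data=login_data)
print(response.geturl())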

Related

How to get a link with web scraping

I would like to do some web scraping with a Python library (Beautiful Soup, for example) to collect the YouTube links on this page:
https://www.last.fm/tag/rock/tracks
Basically, I want to download the title of the song, the name of the artist, and the link to YouTube. Can anyone help me with some code?
Here's how you can do it:
from bs4 import BeautifulSoup
import requests

url = 'https://www.last.fm/tag/rock/tracks'
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"
}
links = []

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# every track name cell on the chart page
urls = soup.find_all(class_='chartlist-name')
for url in urls:
    relative_link = url.find('a')['href']
    link = 'https://www.last.fm/' + relative_link
    links.append(link)

print(links)
With the function soup.find_all you find all the tags with the class "chartlist-name".
The for loop strips the HTML tags and appends the links to the "links" list.
In the future, please provide some code to show what you have attempted.
I have expanded on Fabix's answer. The following code gets the YouTube link, song name, and artist for all 20 pages on the source website.
from bs4 import BeautifulSoup
import requests

master_url = 'https://www.last.fm/tag/rock/tracks?page={}'
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3"
}

for i in range(1, 21):  # pages 1 through 20
    response = requests.get(master_url.format(i), headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    chart_items = soup.find_all(class_='chartlist-row')
    for chart_item in chart_items:
        youtube_link = chart_item.find('a')['href']
        artist = chart_item.find('td', {'class': 'chartlist-artist'}).find('a').text
        song_name = chart_item.find('td', {'class': 'chartlist-name'}).find('a').text
        print('{}, {}, {}'.format(song_name, artist, youtube_link))

aiohttp: Trying to connect to a site

I'm making a Discord bot in Python to scrape Hack The Box data.
It already works, but I want to use async with aiohttp to speed things up when requesting each member's profile.
In the synchronous version, I made a login function that first makes a GET request to grab the token from the login page, then makes a POST request with the token, email and password.
In the asynchronous version with aiohttp, after my POST request the session is not logged in.
I shortened the code a little just for performance testing:
import requests
import re
import json
from os import path  # needed for path.exists() below
from scrapy.selector import Selector
import config as cfg
from timeit import default_timer

class HTBot():
    def __init__(self, email, password, api_token=""):
        self.email = email
        self.password = password
        self.api_token = api_token
        self.session = requests.Session()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
        }
        self.payload = {'api_token': self.api_token}
        if path.exists("users.txt"):
            with open("users.txt", "r") as f:
                self.users = json.loads(f.read())
        else:
            self.users = []

    def login(self):
        # GET the login page to grab the CSRF token, then POST the credentials
        req = self.session.get("https://www.hackthebox.eu/login", headers=self.headers)
        html = req.text
        csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', html)
        if not csrf_token:
            return False
        data = {
            "_token": csrf_token[0],
            "email": self.email,
            "password": self.password
        }
        req = self.session.post("https://www.hackthebox.eu/login", data=data, headers=self.headers)
        if req.status_code == 200:
            print("Connected to HTB!")
            self.session.headers.update(self.headers)
            return True
        print("Could not log in.")
        return False

    def extract_user_info(self, htb_id):
        infos = {}
        req = self.session.get("https://www.hackthebox.eu/home/users/profile/" + str(htb_id), headers=self.headers)
        if req.status_code == 200:
            body = req.text
            html = Selector(text=body)
            infos["username"] = html.css('div.header-title > h3::text').get().strip()
            infos["avatar"] = html.css('div.header-icon > img::attr(src)').get()
            infos["points"] = html.css('div.header-title > small > span[title=Points]::text').get().strip()
            infos["systems"] = html.css('div.header-title > small > span[title="Owned Systems"]::text').get().strip()
            infos["users"] = html.css('div.header-title > small > span[title="Owned Users"]::text').get().strip()
            infos["respect"] = html.css('div.header-title > small > span[title=Respect]::text').get().strip()
            infos["country"] = Selector(text=html.css('div.header-title > small > span').getall()[4]).css('span::attr(title)').get().strip()
            infos["level"] = html.css('div.header-title > small > span::text').extract()[-1].strip()
            infos["rank"] = re.search(r'position (\d+) of the Hall of Fame', body).group(1)
            infos["challs"] = re.search(r'has solved (\d+) challenges', body).group(1)
            infos["ownership"] = html.css('div.progress-bar-success > span::text').get()
            return infos
        return False

    def refresh_user(self, htb_id, new=False):
        users = self.users
        for user in users:
            if user["htb_id"] == htb_id:
                infos = self.extract_user_info(htb_id)

    def refresh_all_users(self):
        users = self.users
        for user in users:
            self.refresh_user(user["htb_id"])
            elapsed = default_timer() - START_TIME
            time_completed_at = "{:5.2f}s".format(elapsed)
            print("{0:<30} {1:>20}".format(user["username"], time_completed_at))
        print("Users have been updated!")

htbot = HTBot(cfg.HTB['email'], cfg.HTB['password'], cfg.HTB['api_token'])
htbot.login()
START_TIME = default_timer()
htbot.refresh_all_users()
Then, my async rewrite of just the login function:
import asyncio
import re
import aiohttp  # missing from the original snippet
import config as cfg

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
}

LOGIN_LOCK = asyncio.Lock()

async def login():
    async with LOGIN_LOCK:
        async with aiohttp.TCPConnector(share_cookies=True) as connector:
            async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
                async with session.get("https://www.hackthebox.eu/login") as req:
                    html = await req.text()
                csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', html)
                if not csrf_token:
                    return False
                payload = {
                    "_token": csrf_token[0],
                    "email": cfg.HTB['email'],
                    "password": cfg.HTB['password']
                }
                async with session.post('https://www.hackthebox.eu/login', data=payload) as req:
                    print(await req.text())
                exit()

async def main():
    await login()

asyncio.run(main())
I think I'm going too far with this connector, the lock, etc., but I've been working on it for two days now and I'm running out of ideas; for now I'm just trying to log in with this POST request.
I also compared the two requests, with Requests and with aiohttp, in Wireshark.
The only difference is that the aiohttp one doesn't send keep-alive and has cookies. (I already tried setting the "connection: keep-alive" header manually, but it doesn't change anything.)
However, according to the documentation, keep-alive should be active by default, so I don't understand.
(In the screenshot the 301 status codes are expected; to see my HTTP requests in Wireshark I had to use http instead of https.)
Screen of Wireshark : https://files.catbox.moe/bignh0.PNG
Thank you if you can help me!
Since I'm new to asynchronous programming, I'll take any advice you have.
Unfortunately, almost everything I've read about it online is outdated as of Python 3.7+ and doesn't use the new syntax.
Okay, I have finally switched to httpx and it worked like a charm.
I really don't know why aiohttp wouldn't work.
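For reference, a rough sketch of what that httpx version of login() could look like, assuming the same token regex and field names as the aiohttp attempt above (this is not the answerer's exact code):

import asyncio
import re
import httpx
import config as cfg

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36"
}

async def login():
    # a single AsyncClient keeps cookies between the GET and the POST
    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        resp = await client.get("https://www.hackthebox.eu/login")
        csrf_token = re.findall(r'type="hidden" name="_token" value="(.+?)"', resp.text)
        if not csrf_token:
            return False
        payload = {
            "_token": csrf_token[0],
            "email": cfg.HTB['email'],
            "password": cfg.HTB['password']
        }
        resp = await client.post("https://www.hackthebox.eu/login", data=payload)
        return resp.status_code == 200

asyncio.run(login())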

I want to open the first link that appears when I do a search on Google

I want to get the first link from the HTML parser, but I'm not getting anything (I tried to print it).
Also, when I inspect the page in the browser, the links are under class='r', but when I print soup.prettify() and look closely, I find there is no class='r'; instead there is class="BNeawe UPmit AP7Wnd".
Please help, thanks in advance!
import requests
import sys
import bs4
import webbrowser

def open_web(query):
    res = requests.get('https://google.com/search?q=' + query)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    link_elements = soup.select('.r a')
    link_to_open = min(1, len(link_elements))
    for i in range(link_to_open):
        webbrowser.open('https://google.com' + link_elements[i].get('href'))

open_web('youtube')
The problem is that Google serves different HTML when you don't specify a User-Agent in the headers. To add a User-Agent to your request, pass it in the headers= argument:
import requests
import bs4

def open_web(query):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'}
    res = requests.get('https://google.com/search?q=' + query, headers=headers)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    link_elements = soup.select('.r a')
    print(link_elements)

open_web('youtube')
Prints:
[<a href="https://www.youtube.com/?gl=EE&hl=et" onmousedown="return rwt(this,'','','','1','AOvVaw2lWnw7oOhIzXdoFGYhvwv_','','2ahUKEwjove3h7onkAhXmkYsKHbWPAUYQFjAAegQIBhAC','','',event)"><h3 class="LC20lb">
... and so on.
You received completely different HTML, with different elements and selectors, which is why the output is empty. The reason Google blocks your request is that the default requests user-agent is python-requests; Google recognizes it and blocks the request. Check what your user-agent is.
The user-agent identifies the browser, its version number, and its host operating system; it is what lets servers and network peers tell whether a request comes from a real browser or a bot. Depending on it, you can receive different HTML with different selectors.
You can pass URL params as a dict(), which is more readable, and requests does everything for you automatically (the same goes for adding a user-agent to the headers):
params = {
    "q": "My query goes here"
}
requests.get("YOUR_URL", params=params)
If you want to get the very first link then use select_one() instead.
Code and full example in the online IDE:
from bs4 import BeautifulSoup
import requests

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
    "q": "My query goes here"
}

html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')

link = soup.select_one('.yuRUbf a')['href']
print(link)
# https://dev.mysql.com/doc/refman/8.0/en/entering-queries.html
Alternatively, you can do the same thing with the Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you only need to extract the data you want from the returned JSON rather than figuring out how to extract, maintain, or bypass blocks from Google.
Code to integrate:
import os
from serpapi import GoogleSearch

params = {
    "engine": "google",
    "q": "My query goes here",
    "hl": "en",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

# [0] means first index of search results
link = results['organic_results'][0]['link']
# https://dev.mysql.com/doc/refman/8.0/en/entering-queries.html
Disclaimer, I work for SerpApi.

Python 3.6.4, Scraping a website that requires login

Login address: https://joffice.jeunesseglobal.com/login.asp
Two fields need to be posted: Username and pw.
I'm using cookies to access https://joffice.jeunesseglobal.com/members/back_office.asp, but I can't log in.
# -*- coding: utf8 -*-
import urllib.request
import urllib.parse
import http.cookiejar

url = 'https://joffice.jeunesseglobal.com/members/back_office.asp'
login_url = "https://joffice.jeunesseglobal.com/login.asp"
login_username = "jianghong181818"
login_password = "Js#168168!"
login_data = {
    "Username": login_username,
    "pw": login_password,
}
post_data = urllib.parse.urlencode(login_data).encode('utf-8')
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}

req = urllib.request.Request(login_url, headers=headers, data=post_data)
# attach a CookieJar so cookies persist across requests made with this opener
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
resp = opener.open(req)
print(resp.read().decode('utf-8'))
Use requests.
Simple way:
>>> import requests
>>> page = requests.get("https://joffice.jeunesseglobal.com/login.asp", auth=('username', 'password'))
Making requests with HTTP Basic Auth:
>>> from requests.auth import HTTPBasicAuth
>>> requests.get("https://joffice.jeunesseglobal.com/login.asp", auth=HTTPBasicAuth('user', 'pass'))
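Note that auth= sends HTTP Basic Auth headers, whereas the page described in the question expects Username and pw as form fields. If the site uses an ordinary form login, a session-based POST (the same pattern as the Flurry answer below) might be closer to what's needed; a rough sketch with placeholder credentials, not verified against this site:

import requests

login_url = "https://joffice.jeunesseglobal.com/login.asp"
back_office_url = "https://joffice.jeunesseglobal.com/members/back_office.asp"

with requests.Session() as session:
    # POST the form fields named in the question; the Session keeps any cookies the server sets
    session.post(login_url, data={"Username": "your-username", "pw": "your-password"})
    resp = session.get(back_office_url)
    print(resp.status_code, resp.url)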

Flurry scraping using python3 requests.Session()

This seems really straightforward, but for some reason this isn't connecting to Flurry correctly and I'm unable to scrape the data.
import requests

loginurl = "https://dev.flurry.com/secure/loginPage.do"
csvurl = "https://dev.flurry.com/eventdata"

session = requests.Session()
login = session.post(loginurl, data={'loginEmail': 'user', 'loginPassword': 'pass'})
data = session.get(csvurl)
Every time I try this, I get redirected back to the login screen (loginurl) without fetching the new data. Has anyone been able to connect to Flurry like this successfully before?
Any and all help would be greatly appreciated, thanks.
There are two more form fields that need to be populated: struts.token.name and a field whose name is the value of struts.token.name (i.e. the token). You also have to post to loginAction.do.
You can do an initial GET, parse the values using bs4, then post the data:
from bs4 import BeautifulSoup
import requests

loginurl = "https://dev.flurry.com/secure/loginAction.do"
csvurl = "https://dev.flurry.com/eventdata"

data = {'loginEmail': 'user', 'loginPassword': 'pass'}

with requests.Session() as session:
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"})
    # GET the login page first and pull the struts token fields out of the form
    soup = BeautifulSoup(session.get(loginurl).content, "html.parser")
    name = soup.select_one("input[name=struts.token.name]")["value"]
    data["struts.token.name"] = name
    data[name] = soup.select_one("input[name={}]".format(name))["value"]
    login = session.post(loginurl, data=data)
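After the login POST, and still inside the same with requests.Session() block, the CSV endpoint from the question should be reachable with the cookies that the login set; a short follow-up sketch, not verified against Flurry:

    # still inside the "with requests.Session() as session:" block above,
    # so the cookies from the login POST are sent automatically
    data = session.get(csvurl)
    print(data.status_code)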
