Python 3.6.4, Scraping a website that requires login - python-3.x

Login address: https://joffice.jeunesseglobal.com/login.asp
The form takes two fields: Username and pw.
I want to use the session cookie to access https://joffice.jeunesseglobal.com/members/back_office.asp, but I can't log in.
# -*- coding: utf8 -*-
import urllib.parse
import urllib.request
import http.cookiejar

url = 'https://joffice.jeunesseglobal.com/members/back_office.asp'
login_url = "https://joffice.jeunesseglobal.com/login.asp"
login_username = "jianghong181818"
login_password = "Js#168168!"
login_data = {
    "Username": login_username,
    "pw": login_password,
}
post_data = urllib.parse.urlencode(login_data).encode('utf-8')
headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
req = urllib.request.Request(login_url, headers=headers, data=post_data)
# Route requests through an opener that keeps cookies between calls.
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
resp = opener.open(req)
print(resp.read().decode('utf-8'))

Use requests
A simple way:
>>> import requests
>>> page = requests.get("https://joffice.jeunesseglobal.com/login.asp", auth=('username', 'password'))
Making requests with HTTP Basic Auth explicitly:
>>> from requests.auth import HTTPBasicAuth
>>> requests.get("https://joffice.jeunesseglobal.com/login.asp", auth=HTTPBasicAuth('user', 'pass'))
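Note that auth= sends HTTP Basic Auth credentials, which only helps if the server actually uses that scheme. A page with an HTML login form (as here, with Username and pw fields) usually needs a session that POSTs the form and carries the cookies forward. A minimal sketch, assuming the form posts back to login.asp and requires no hidden fields (inspect the page to confirm):

import requests

login_url = "https://joffice.jeunesseglobal.com/login.asp"
office_url = "https://joffice.jeunesseglobal.com/members/back_office.asp"

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    # Field names taken from the question; the form may require more.
    s.post(login_url, data={"Username": "your_username", "pw": "your_password"})
    resp = s.get(office_url)
    print(resp.status_code)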

Related

How to scrape an article which requires login to view full content using python?

I am trying to scrape an article from The Wall Street Journal, and it requires a log-in to view the whole content. So I have written the following code using Python Requests:
import requests
from bs4 import BeautifulSoup
import re
import base64
import json

username = <username>
password = <password>
base_url = "https://accounts.wsj.com"

session = requests.Session()
r = session.get("{}/login".format(base_url))
soup = BeautifulSoup(r.text, "html.parser")

# The login page embeds its OAuth client settings as a Base64 blob.
credentials_search = re.search(r"Base64\.decode\('(.*)'", r.text, re.IGNORECASE)
base64_decoded = base64.b64decode(credentials_search.group(1))
credentials = json.loads(base64_decoded)

connection = <connection_name>
r = session.post(
    'https://sso.accounts.dowjones.com/usernamepassword/login',
    data={
        "username": username,
        "password": password,
        "connection": connection,
        "client_id": credentials["clientID"],
        "state": credentials["internalOptions"]["state"],
        "nonce": credentials["internalOptions"]["nonce"],
        "scope": credentials["internalOptions"]["scope"],
        "tenant": "sso",
        "response_type": "code",
        "protocol": "oauth2",
        "redirect_uri": "https://accounts.wsj.com/auth/sso/login"
    })
soup = BeautifulSoup(r.text, "html.parser")

# Forward the hidden form fields returned by the SSO endpoint.
login_result = dict([
    (t.get("name"), t.get("value"))
    for t in soup.find_all('input')
    if t.get("name") is not None
])
r = session.post(
    'https://sso.accounts.dowjones.com/login/callback',
    data=login_result,
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"},
)

# article get request
r = session.get(
    "https://www.wsj.com/articles/singapore-prime-minister-lee-rejects-claims-he-misused-state-powers-in-family-feud-1499094761",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"}
)
print(r.text)
I am able to log in through the request, but I still don't get the full article to scrape. Can anyone help me with this? Thanks in advance :-)
An easy and reliable solution is using the Selenium webdriver.
With Selenium you drive an automated browser window that opens the website, and from there you can select the elements needed to log in. The content then loads as usual, just as when you browse the site manually.
Then you can soup that page with BeautifulSoup.
from bs4 import BeautifulSoup
from selenium import webdriver

# Download the driver for your desired browser and place it in any path.
driver = webdriver.Firefox(executable_path=r"C:\Program Files (x86)\geckodriver.exe")
# For Chrome it's: driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")

# Open your website link.
driver.get("https://www.your-url.com")

# Then soup the page with BS.
html = driver.page_source
page_soup = BeautifulSoup(html, "html.parser")
From there you can use the "page_soup" as you usually would.
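For the log-in step itself, a minimal sketch continuing from the driver above; the element names are hypothetical, so inspect the real form with your browser's developer tools first:

from selenium.webdriver.common.by import By

driver.get("https://www.your-url.com/login")
# Hypothetical field names; replace them with the site's actual ones.
driver.find_element(By.NAME, "username").send_keys("your_username")
driver.find_element(By.NAME, "password").send_keys("your_password")
driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()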
Any questions? :)

WebScraping / Identical sites not working?

I would like to scrape the header element from both of these links.
The two pages look absolutely identical to me (screenshots below).
Why does the scraping work only for the second link and not for the first?
import time
import requests
from bs4 import BeautifulSoup

# not working
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, "html.parser")
erg = soup.find("header")
print(f"First Link: {erg}")

# working
link = "https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4"
page = requests.get(link)
time.sleep(1)
soup = BeautifulSoup(page.content, "html.parser")
erg = soup.find("header")
print(f"Second Link: {len(erg)}")
(Screenshots of the working and not-working outputs were attached here.)
The page is sometimes rendered by JavaScript, so requests won't always see the header in the response. You can use a loop that re-requests the page until the header appears in the soup, then break:
import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

while True:
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break
print(header)
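Note that a bare while True will hammer the server indefinitely if the header never renders; a capped retry with a short pause is safer. A sketch under the same assumptions as above:

import time
import requests
from bs4 import BeautifulSoup

headers = {"user-agent": "Mozilla/5.0"}
link = "https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4"

header = None
for attempt in range(10):  # give up after 10 tries
    soup = BeautifulSoup(requests.get(link, headers=headers).content, "html.parser")
    header = soup.find("header")
    if header:
        break
    time.sleep(1)  # brief pause between retries
print(header)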
Try this to get whatever fields you wish to grab from those links. Currently it fetches the title. You can modify res.json()['data'][0]['attributes']['name'] to grab any field of your interest. Make sure to put the URLs within the urls_to_scrape list.
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote

urls_to_scrape = {
    'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4',
    'https://apps.apple.com/us/app/jackpot-boom-casino-slots/id1554995201?uo=4'
}
base_url = 'https://apps.apple.com/us/app/bingo-story-live-bingo-games/id1179108009?uo=4'
link = 'https://amp-api.apps.apple.com/v1/catalog/US/apps/{}'
params = {
    'platform': 'web',
    'additionalPlatforms': 'appletv,ipad,iphone,mac',
    'extend': 'customPromotionalText,customScreenshotsByType,description,developerInfo,distributionKind,editorialVideo,fileSizeByDevice,messagesScreenshots,privacy,privacyPolicyText,privacyPolicyUrl,requirementsByDeviceFamily,supportURLForLanguage,versionHistory,websiteUrl',
    'include': 'genres,developer,reviews,merchandised-in-apps,customers-also-bought-apps,developer-other-apps,app-bundles,top-in-apps,related-editorial-items',
    'l': 'en-us',
    'limit[merchandised-in-apps]': '20',
    'omit[resource]': 'autos',
    'sparseLimit[apps:related-editorial-items]': '5'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    # Fetch any app page once to harvest the bearer token the web app embeds.
    res = s.get(base_url)
    soup = BeautifulSoup(res.text, "lxml")
    token_raw = soup.select_one("[name='web-experience-app/config/environment']").get("content")
    token = json.loads(unquote(token_raw))['MEDIA_API']['token']
    s.headers['Accept'] = 'application/json'
    s.headers['Referer'] = 'https://apps.apple.com/'
    s.headers['Authorization'] = f'Bearer {token}'
    for url in urls_to_scrape:
        id_ = url.split("/")[-1].strip("id").split("?")[0]
        res = s.get(link.format(id_), params=params)
        title = res.json()['data'][0]['attributes']['name']
        print(title)

Instagram Scraping with endpoints requires authentication for all requests

As you know, Instagram announced that they changed their endpoint APIs this month.
It looks like, in the wake of Cambridge Analytica, Instagram has changed its endpoint formats and now requires a logged-in user session for all requests.
I'm not sure which endpoints need updating, but I was specifically using the media/comments endpoints, which are now as follows:
Media OLD:
https://www.instagram.com/graphql/query/?query_id=17888483320059182&id={0}&first=100&after={1}
Media NEW:
https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%7B%22id%22%3A%2221575514%22%2C%22first%22%3A12%2C%22after%22%3A%22AQAHXuz1DPmI3FFLOzy5iKEhHOLKw3lt_ozVR40TphSdns0Vp5j_ZEU6Qj0CW6IqNtVGO5pmLCQoX0Y8RVS9aRTT2lWPp6vf8vFqjo1QfxRYmA%22%7D
The script that I used to work around this problem is as follows:
#!/usr/bin/env python3
import requests
import urllib.parse
import hashlib
import json

#CHROME_UA = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'

def getSession_old(rhx_gis, csrf_token, variables):
    """ Get session preconfigured with required headers & cookies. """
    # Signature: "rhx_gis:csrf_token:user_agent:variables"
    print(variables)
    values = "%s:%s:%s:%s" % (
        rhx_gis,
        csrf_token,
        CHROME_UA,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
    session = requests.Session()
    session.headers = {
        'user-agent': CHROME_UA,
        'x-instagram-gis': x_instagram_gis
    }
    print(x_instagram_gis)
    session.cookies.set('ig_pr', '2')
    session.cookies.set('csrftoken', csrf_token)
    return session

def getSession(rhx_gis, variables):
    """ Get session preconfigured with required headers & cookies. """
    # Signature: "rhx_gis:variables"
    values = "%s:%s" % (
        rhx_gis,
        variables)
    x_instagram_gis = hashlib.md5(values.encode()).hexdigest()
    session = requests.Session()
    session.headers = {
        'x-instagram-gis': x_instagram_gis
    }
    return session

if __name__ == '__main__':
    session = requests.Session()
    session.headers = {'user-agent': CHROME_UA}
    response = session.get("https://www.instagram.com/selenagomez")
    data = json.loads(response.text.split("window._sharedData = ")[1].split(";</script>")[0])
    csrf = data['config']['csrf_token']
    rhx_gis = data['rhx_gis']
    variables = '{"id":"460563723","first":10,"after":"AQBf8puhlt8nU2JzmYdMMTuH0FbMgUM1fnIOZIH7n94DM4VLWkVILUAKVB-5dqvxQEI-Wd0ttlEDzimaaqwC98jccQaDQT4tSF56c_NlWi_shg"}'
    session = getSession(rhx_gis, variables)
    query_hash = '42323d64886122307be10013ad2dcc44'
    encoded_vars = urllib.parse.quote(variables, safe='"')
    url = 'https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s' % (query_hash, encoded_vars)
    print(url)
    print(session.get(url).text)
I am sure this script was working until about 11 days ago, but it is not working now.
Does anyone know how to get user posts without authenticating?

Why does using a cookie to log in to a website fail in Python 3?

I want to use a cookie I have obtained to access a website and read some information that is only visible after the user logs in. But when I try it, the result shows the user isn't logged in to the website. This is my code; can someone tell me how to fix the problem?
import re
import urllib.error
import urllib.parse
import urllib.request
import http.cookiejar
from bs4 import BeautifulSoup

LOGIN_URL = "https://www.yaozh.com/login/"
values = {'username': 'username', 'pwd': 'password'}  # , 'submit': 'Login'
postdata = urllib.parse.urlencode(values).encode()
user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36')
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
    page = response.read().decode()
    # print(page)
except urllib.error.URLError as e:
    print(e.code, ':', e.reason)

cookie.save(ignore_discard=True, ignore_expires=True)
print(cookie)
for item in cookie:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

get_url = 'https://db.yaozh.com/instruct?p=1&pageSize=20'
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
content = get_response.read()  # read once; the stream can't be read twice
print(content)

bs = BeautifulSoup(content, "html.parser")
urls = bs.find_all(name='a', attrs={"href": re.compile(r"\.doc")}, recursive=True)
print(len(urls))
for url in urls:
    print(url["href"])
The problem has been solved. If you face the same problem, check whether the data you POST to the server is complete: many websites include extra fields that real users never see, and use them to judge whether the requester is a real user. Good luck!
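In other words, the login form may carry hidden input fields (a CSRF token, for example) that must be POSTed along with the username and password. A minimal sketch with requests and BeautifulSoup that collects them first, assuming the form posts back to the same URL:

import requests
from bs4 import BeautifulSoup

LOGIN_URL = "https://www.yaozh.com/login/"

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    # Collect every named field the form already carries, hidden ones included.
    soup = BeautifulSoup(s.get(LOGIN_URL).text, "html.parser")
    data = {i.get("name"): i.get("value", "")
            for i in soup.find_all("input") if i.get("name")}
    data.update({"username": "username", "pwd": "password"})
    resp = s.post(LOGIN_URL, data=data)
    print(resp.status_code)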

Python 3 - Add custom headers to urllib.request Request

In Python 3, the following code obtains the HTML source for a webpage.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
response = urllib.request.urlopen(url)
response.read()
How can I add the following custom header to the request when using urllib.request?
headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
The request headers can be customized by first creating a request object then supplying it to urlopen.
import urllib.request
url = "https://docs.python.org/3.4/howto/urllib2.html"
hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
req = urllib.request.Request(url, headers=hdr)
response = urllib.request.urlopen(req)
response.read()
Source: Python 3.4 Documentation
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
response = urllib.request.urlopen("url")  # replace "url" with the actual URL
response.read()
Should you wish to learn the details, you can refer to the Python documentation: https://docs.python.org/3/library/urllib.request.html
# urlopen itself does not accept a headers argument; build a Request
# object with the headers and pass that to urlopen instead.
from urllib.request import Request, urlopen

url = "https://docs.python.org/3.4/howto/urllib2.html"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
response = urlopen(Request(url, headers=header))
response.read()
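To confirm the header is actually being sent, you can point the same request at a service that echoes request headers back, such as httpbin:

import urllib.request

req = urllib.request.Request("https://httpbin.org/headers",
                             headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
with urllib.request.urlopen(req) as response:
    print(response.read().decode())  # the echoed JSON includes our User-Agent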
