import mechanize
import json
import re
from bs4 import BeautifulSoup
url = "https://app.propertymeld.com/login/?next=/"
browser = mechanize.Browser()
browser.open(url)
browser.select_form(nr = 0)
browser.form['email'] = "aaa"
browser.form['password'] = "bbb"
result = browser.submit().read()
print(result)
soup = BeautifulSoup(result,"html.parser")
print(soup)
Is there any method to get the request URL from the website? After logging in, the URL is hidden and only JavaScript is available.
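Since the page is driven by JavaScript after the login, one option is to let a real browser perform the login and then read whatever URL it lands on. A minimal sketch with Selenium; the field names and the submit-button selector are assumptions, so check them against the actual login form:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get("https://app.propertymeld.com/login/?next=/")

# NOTE: "email" and "password" are assumed field names; inspect the real form to confirm.
driver.find_element(By.NAME, "email").send_keys("aaa")
driver.find_element(By.NAME, "password").send_keys("bbb")
driver.find_element(By.CSS_SELECTOR, "form button[type='submit']").click()

# Give the JavaScript redirect a moment to finish (an explicit wait would be more robust).
time.sleep(5)

# The post-login request URL the browser was sent to:
print(driver.current_url)
driver.quit()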
import requests
from bs4 import BeautifulSoup
url = "https://music.163.com/discover/toplist?id=19723756"
headers = {
    'User-Agent': "PostmanRuntime/7.15.2",
}
response = requests.request("GET", url, headers=headers)
r = response.text
soup = BeautifulSoup(response.text, "lxml")
textarea = soup.find('textarea', attrs={'id': 'song-list-pre-data'}).get_text()
print(textarea)
In the Linux environment, the matched JSON result is truncated.
The textarea ends with: xxxxxx ee":0,"album":{"id":158052587,"name":"Sakana~( ˵>ㅿㅿ
I think it's probably because of the special symbols.
How do you deal with this situation?
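If the console's locale cannot represent some of those symbols, the printed output can look garbled or cut off even though the data itself is complete. Writing the raw textarea to a file with an explicit UTF-8 encoding is a quick way to check; the file name here is arbitrary:
# Save the extracted textarea verbatim; UTF-8 can encode every character in it.
with open("songs.json", "w", encoding="utf-8") as fp:
    fp.write(textarea)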
You need to convert the string to a JSON object list; then you can print each song.
I tested on Ubuntu 20.04 and on Windows in the VS Code terminal. Both work.
Code
import requests
import json
from bs4 import BeautifulSoup
url = "https://music.163.com/discover/toplist?id=19723756"
headers = {
    'User-Agent': "PostmanRuntime/7.15.2",
}
response = requests.request("GET", url, headers=headers)
# Specify the parser explicitly to avoid the "no parser was explicitly specified" warning.
soup = BeautifulSoup(response.text, "lxml")
textarea = soup.find('textarea', attrs={'id': 'song-list-pre-data'}).get_text()
json_list = json.loads(textarea)
for song in json_list:
    print("album:", song['album']['name'], ", artists: ", song['artists'][0]['name'], "duration: ", song['duration'])
Result on Ubuntu 20.04:
Result in the VS Code terminal:
I want to crawl a Confluence forum that requires authentication as well as authorization. After long research I somehow managed to write a script that does exactly that, but I'm lost on how to implement auth in front of the CrawlSpider, including the header with the auth token, so that the spider is logged in and able to crawl all accessible subsites.
Here's my double-auth script:
from urllib import response
from atlassian import Confluence
import subprocess
from urllib.parse import urlparse
import atlassian
from bs4 import BeautifulSoup
import getpass
import requests
import logging
from PyQt6.QtGui import *
from PyQt6.QtCore import *
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By
logging.basicConfig(filename='conf_connect.log', filemode='w', level=logging.DEBUG)
global url
url='http://MYCONFL.atlassian.net/wiki/'
confluence = Confluence(
    url='http://MYCONFL.atlassian.net/wiki/',
    username='USERNAME',
    password='PASSWORD')
with requests.Session() as session:
    auth = ('USERNAME', getpass.getpass())
    session.verify = True
    session.auth = auth
    # Send the auth token and content type with every request made through the session.
    session.headers.update({
        'Authorization': 'Basic base64encodedauthtoken',
        'Content-Type': 'application/json'})
    # storing response (use the authenticated session rather than a bare requests.get)
    response = session.get(url)
    if response.status_code != 200:
        print("Error - Something went wrong!")
    else:
        print("all good, Code 200")
Here is the CrawlSpider's code:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract

url_list = [
    'https://www.MYCONFL.atlassian.com/wiki/'
]

class CrawlySpider(CrawlSpider):
    name = 'crawly'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = url_list

    rules = (
        # Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        domain = tldextract.extract(response.request.url)[1]
        path = urlparse(response.request.url)[2].replace("/", "")
        with open(f'{domain}.txt', 'a') as fp:
            fp.write(text)
        with open(f'{domain} {path}.html', 'wb') as fp:
            fp.write(response.body)
Thanks in Advance for Advice :D
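One way to get the spider authenticated is to attach the auth header to every request it makes, via DEFAULT_REQUEST_HEADERS in the spider's custom_settings. A minimal sketch; the token is the same placeholder as above, and your Confluence instance may expect a different auth scheme (for example basic auth with an API token):
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class AuthCrawlySpider(CrawlSpider):
    name = 'auth_crawly'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = ['https://www.MYCONFL.atlassian.com/wiki/']

    # These headers are added to every request, including the ones
    # generated by the LinkExtractor rules.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Authorization': 'Basic base64encodedauthtoken',  # placeholder token
            'Content-Type': 'application/json',
        }
    }

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # Reuse your existing parse_page body here; this line just confirms the login worked.
        self.logger.info("fetched %s (%d bytes)", response.url, len(response.body))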
I use BS4 to get "Browse Standards by Technology" from the website https://standards.globalspec.com/:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://standards.globalspec.com/"
q1 = urlopen(url)
soup = BeautifulSoup(q1, 'lxml')
print(soup)
But I get an error: urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable
Could anyone see what could be causing this error?
As @Samt94 has already stated, the website is under Cloudflare protection, so you can use cloudscraper instead of requests:
from bs4 import BeautifulSoup
import cloudscraper
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://standards.globalspec.com/'
req = scraper.get(url)
print(req)
soup = BeautifulSoup(req.text,'lxml')
Output:
<Response [200]>
You can use cloudscraper to access websites that use Cloudflare DDoS protection:
from bs4 import BeautifulSoup
import cloudscraper
url = "https://standards.globalspec.com/"
scraper = cloudscraper.create_scraper()
q1 = scraper.get(url)
soup = BeautifulSoup(q1.text, 'lxml')
print(soup)
I simply want to get the HTML of the search bar of the https://www.daraz.com.pk website. I have written code and tried it on "https://www.amazon.com", "https://www.alibaba.com", "https://www.goto.com.pk" and many others, and it works fine, but it is not working on https://www.daraz.com.pk.
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib import request
import ssl
import requests
ssl._create_default_https_context = ssl._create_unverified_context
html = urlopen("https://www.daraz.com.pk")
bsObj = BeautifulSoup(html, features="lxml")
nameList = bsObj.find("input", {"type": "search"})
print(nameList)
It returns None; instead it should return:
<input type="search" id="q" name="q" placeholder="Search in Daraz" class="search-box__input--O34g" tabindex="1" value="" data-spm-anchor-id="a2a0e.home.search.i0.35e34937eWCmbI">
I have also tried similar code on Amazon, Alibaba and some other sites, which successfully returned their HTML:
html = urlopen("https://www.amazon.com")
bsObj = BeautifulSoup(html, features="lxml")
nameList = bsObj.find("input", {"type": "text"})
print(nameList)
I have also tried it this way:
bsObj=BeautifulSoup(requests.get("https://www.daraz.com.pk").content,
"html.parser")
nameList = bsObj.find("input", {"type": "search"})
print(nameList)
and this way, using Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Firefox()
driver.get("https://www.daraz.com.pk")
time.sleep(2)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
time.sleep(2)
officials = soup.find("input", {"type": "search"})
print(str(officials))
but it failed.
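If the search box is injected by JavaScript, a fixed two-second sleep may simply be too short for it to show up in page_source. A sketch with an explicit wait; the input#q selector is taken from the markup you expect above, so treat it as an assumption:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
driver.get("https://www.daraz.com.pk")

# Wait up to 20 seconds for the search input to actually appear in the DOM.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "input#q"))
)

soup = BeautifulSoup(driver.page_source, "html.parser")
print(soup.find("input", {"id": "q"}))
driver.quit()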
I'm using requests.get like so:
import urllib3
import requests
from bs4 import BeautifulSoup
urllib3.disable_warnings()
cookies = {
    'PYPF': '3OyMLS2-xJlxKilWEOSvMQXAhyCgIhvAxYfbB8S_5lGBxxAS18Z7I8Q',
    '_ga': 'GA1.2.227320333.1496647453',
    '_gat': '1',
    '_gid': 'GA1.2.75815641.1496647453'
}
params = {
    'platform': 'xbox'
}
page = requests.get("http://www.rl-trades.com/#pf=xbox", headers={'Platform': 'Xbox'}, verify=False, cookies=cookies, params=params).text
page
soup = BeautifulSoup(page, 'html.parser')
... etc.
But, from my results in testing, it seems requests.get is ignoring '/#pf=xbox' in 'http://www.rl-trades.com/#pf=xbox'.
Is this because I am having to set verify to False? What is going on here?
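One thing worth ruling out first: everything after the # is a URL fragment, and HTTP clients never send the fragment to the server, so requests only asks for the bare path regardless of verify or cookies. A small check of how the URL is split up:
from urllib.parse import urlsplit

parts = urlsplit("http://www.rl-trades.com/#pf=xbox")
# The fragment is kept client-side and is not part of the HTTP request line.
print(parts.path)      # '/'
print(parts.fragment)  # 'pf=xbox'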