Is there any way to get all request url - python-3.x

import mechanize
import json
import re
from bs4 import BeautifulSoup
url = "https://app.propertymeld.com/login/?next=/"
browser = mechanize.Browser()
browser.open(url)
browser.select_form(nr = 0)
browser.form['email'] = "aaa"
browser.form['password'] = "bbb"
result = browser.submit().read()
print(result)
soup = BeautifulSoup(result,"html.parser")
print(soup)
Is there any method to get the request URL from the website? After logging in, the URL is hidden; only JavaScript is available.
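If the goal is just to see which URL the login ended up on, mechanize can report it directly. A minimal sketch, not a complete solution: mechanize does not execute JavaScript, so any URLs the page requests via scripts after login will only be visible in the browser's developer-tools network tab, not here.
import mechanize

browser = mechanize.Browser()
browser.open("https://app.propertymeld.com/login/?next=/")
browser.select_form(nr=0)
browser.form['email'] = "aaa"
browser.form['password'] = "bbb"
browser.submit()

# URL of the page the login actually redirected to
print(browser.geturl())

# absolute URLs of all <a> links found in the returned HTML
for link in browser.links():
    print(link.absolute_url)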

Related

Why Is JSON Truncated During Linux HTML Response Parsing?

import requests
from bs4 import BeautifulSoup

url = "https://music.163.com/discover/toplist?id=19723756"
headers = {
    'User-Agent': "PostmanRuntime/7.15.2",
}
response = requests.request("GET", url, headers=headers)
r = response.text
soup = BeautifulSoup(response.text, "lxml")
textarea = soup.find('textarea', attrs={'id': 'song-list-pre-data'}).get_text()
print(textarea)
In the Linux environment, the extracted JSON is truncated. The textarea ends with: xxxxxx ee":0,"album":{"id":158052587,"name":"Sakana~( ˵>ㅿㅿ
I think it's probably because of the special symbols.
How do you deal with this situation?
You need to convert the string into a list of JSON objects; then you can print each song.
I tested this on Ubuntu 20.04 and on Windows in the VS Code terminal; both work.
Code
import requests
import json
from bs4 import BeautifulSoup

url = "https://music.163.com/discover/toplist?id=19723756"
headers = {
    'User-Agent': "PostmanRuntime/7.15.2",
}
response = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(response.text, "lxml")  # pass a parser explicitly to avoid the "no parser specified" warning
textarea = soup.find('textarea', attrs={'id': 'song-list-pre-data'}).get_text()
json_list = json.loads(textarea)
for song in json_list:
    print("album:", song['album']['name'], ", artists: ", song['artists'][0]['name'], "duration: ", song['duration'])
Results on Ubuntu 20.04 and in the VS Code terminal (screenshots omitted).

How to implement Confluence Forum Authorization into CrawlSpider

I want to crawl a Confluence forum that requires authentication as well as authorization. After long research I somehow managed to write a script that does exactly that, but I'm a bit lost on how to put the auth in front of the CrawlSpider, including the header with the auth token, so that the spider is logged in and can crawl all accessible subsites.
Here's my double-auth script:
from urllib import response
from atlassian import Confluence
import subprocess
from urllib.parse import urlparse
import atlassian
from bs4 import BeautifulSoup
import getpass
import requests
import logging
from PyQt6.QtGui import *
from PyQt6.QtCore import *
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By
logging.basicConfig(filename='conf_connect.log', filemode='w', level=logging.DEBUG)
global url
url='http://MYCONFL.atlassian.net/wiki/'
confluence = Confluence(
    url='http://MYCONFL.atlassian.net/wiki/',
    username='USERNAME',
    password='PASSWORD')

with requests.session() as s:
    auth = ('USERNAME', getpass.getpass())
    session = requests.session()
    session.verify = True
    session.auth = auth
    headers = {
        'Authorization': 'Basic base64encodedauthtoken',
        'Content-Type': 'application/json'}
    # storing response
    response = requests.get(url)
    if response.status_code != 200:
        print("Error - Something went wrong!")
    else:
        print("all good, Code 200")
Here is the CrawlSpider's code:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tldextract
url_list = [
    'https://www.MYCONFL.atlassian.com/wiki/'
]

class CrawlySpider(CrawlSpider):
    name = 'crawly'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = url_list
    rules = (
        # Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text()
        domain = tldextract.extract(response.request.url)[1]
        path = urlparse(response.request.url)[2].replace("/", "")
        with open(f'{domain}.txt', 'a') as fp:
            fp.write(text)
        with open(f'{domain} {path}.html', 'wb') as fp:
            fp.write(response.body)
Thanks in Advance for Advice :D
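One possible direction (a minimal sketch, not a tested solution, and it assumes the Confluence instance accepts the same Basic auth token used above): Scrapy's DEFAULT_REQUEST_HEADERS setting, set per spider via custom_settings, attaches the Authorization header to every request the CrawlSpider makes, including the requests generated by the LinkExtractor rules.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class AuthedCrawlySpider(CrawlSpider):
    name = 'crawly_authed'
    allowed_domains = ['MYCONFL.atlassian.com']
    start_urls = ['https://www.MYCONFL.atlassian.com/wiki/']

    # Applied to every request this spider issues, including rule-followed links.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Authorization': 'Basic base64encodedauthtoken',  # placeholder token from the script above
        }
    }

    rules = (
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # the same BeautifulSoup/tldextract parsing as in the question would go here
        self.logger.info("Crawled %s (status %s)", response.url, response.status)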

Why do I get HTTP Error 503 using urllib and BS4?

I use BS4 to get the "Browse Standards by Technology" listing from the website https://standards.globalspec.com/:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://standards.globalspec.com/"
q1 = urlopen(url)
soup = BeautifulSoup(q1, 'lxml')
print(soup)
But I get an error: urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable
Could anyone see what is causing this error?
@Samt94 has already stated that the website is under Cloudflare protection, so you can use cloudscraper instead of requests:
from bs4 import BeautifulSoup
import cloudscraper
scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',})
url = 'https://standards.globalspec.com/'
req = scraper.get(url)
print(req)
soup = BeautifulSoup(req.text,'lxml')
Output:
<Response [200]>
You can use CloudScraper to access websites that use CloudFlare DDoS Protection:
from bs4 import BeautifulSoup
import cloudscraper
url = "https://standards.globalspec.com/"
scraper = cloudscraper.create_scraper()
q1 = scraper.get(url)
soup = BeautifulSoup(q1.text, 'lxml')
print(soup)

Python (beautiful-soup) returning "none" for existing html while crawling

I simply want to get the HTML of the search bar of the https://www.daraz.com.pk website. I wrote the code below and tried it on https://www.amazon.com, https://www.alibaba.com, https://www.goto.com.pk and many others, and it works fine, but it is not working on https://www.daraz.com.pk.
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib import request
import ssl
import requests
ssl._create_default_https_context = ssl._create_unverified_context
html = urlopen("https://www.daraz.com.pk")
bsObj = BeautifulSoup(html, features="lxml")
nameList = bsObj.find("input", {"type": "search"})
print(nameList)
it returns None; instead it should return:
<input type="search" id="q" name="q" placeholder="Search in Daraz" class="search-box__input--O34g" tabindex="1" value="" data-spm-anchor-id="a2a0e.home.search.i0.35e34937eWCmbI">
I have also tried similar code on Amazon, Alibaba and some other sites, which successfully returned their HTML:
html = urlopen("https://www.amazon.com")
bsObj = BeautifulSoup(html, features="lxml")
nameList = bsObj.find("input", {"type": "text"})
print(nameList)
I have also tried it this way:
bsObj = BeautifulSoup(requests.get("https://www.daraz.com.pk").content, "html.parser")
nameList = bsObj.find("input", {"type": "search"})
print(nameList)
and this way, using Selenium:
from selenium import webdriver
import time

driver = webdriver.Firefox()
driver.get("https://www.daraz.com.pk")
time.sleep(2)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
time.sleep(2)
officials = soup.find("input", {"type": "search"})
print(str(officials))
but failed.
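The search bar on daraz.com.pk is most likely rendered by JavaScript after the initial HTML arrives, which would explain why urlopen and requests never see it. A minimal Selenium sketch (assuming Firefox/geckodriver are set up as in the attempt above): instead of a fixed time.sleep(2), wait explicitly until the input has been rendered.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
try:
    driver.get("https://www.daraz.com.pk")
    # wait up to 15 seconds for the JavaScript-rendered search input to appear
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="search"]'))
    )
    soup = BeautifulSoup(driver.page_source, "html.parser")
    print(soup.find("input", {"type": "search"}))
finally:
    driver.quit()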

Python3 requests.get ignoring part of my URL (BEAUTIFULSOUP + PYTHON WEBSCRAPING)

I'm using requests.get like so:
import urllib3
import requests
from bs4 import BeautifulSoup
urllib3.disable_warnings()
cookies = {
    'PYPF': '3OyMLS2-xJlxKilWEOSvMQXAhyCgIhvAxYfbB8S_5lGBxxAS18Z7I8Q',
    '_ga': 'GA1.2.227320333.1496647453',
    '_gat': '1',
    '_gid': 'GA1.2.75815641.1496647453'
}
params = {
    'platform': 'xbox'
}
page = requests.get("http://www.rl-trades.com/#pf=xbox", headers={'Platform': 'Xbox'}, verify=False, cookies=cookies, params=params).text
page
soup = BeautifulSoup(page, 'html.parser')
... etc.
But, from my results in testing, it seems requests.get is ignoring '/#pf=xbox' in 'http://www.rl-trades.com/#pf=xbox'.
Is this because I have to set verify to False? What is going on here?
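Neither verify=False nor requests is the problem: everything after '#' is a URL fragment, which clients never send to the server; it is interpreted purely client-side (on this site, presumably by JavaScript that filters the listing). A small sketch that makes this visible follows; whether rl-trades.com honours an equivalent query parameter is an assumption to check against the site, and if the filtering happens only in JavaScript, a plain requests call will not see the filtered content at all.
from urllib.parse import urlsplit
import requests

parts = urlsplit("http://www.rl-trades.com/#pf=xbox")
print(parts.path)      # '/'
print(parts.fragment)  # 'pf=xbox' -- kept by the client, never sent in the HTTP request

# A query string, unlike a fragment, does go to the server:
prepared = requests.Request("GET", "http://www.rl-trades.com/", params={'pf': 'xbox'}).prepare()
print(prepared.url)    # http://www.rl-trades.com/?pf=xbox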
