Scraping Site Data without Selenium - python-3.x

Currently I am trying to pull CMS historical data from their site. I have some working code that pulls the download links from the page. My problem is that the links are split across pages, and I need to iterate through all the available pages to extract every download link. The obvious choice here would be to use Selenium to click through the pages and collect the data, but due to company policy I cannot run Selenium in my environment. Is there a way I can go through the pages and extract the links? The website does not expose the POST link when you try to go to the next page, and I am out of ideas for reaching the next page without that link and without using Selenium.
Current working code to pull links from the first page:
import pandas as pd
from datetime import datetime
#from selenium import webdriver
from lxml import html
import requests


def http_request_get(url, session=None, payload=None, parse=True):
    """Send a GET HTTP request to a website and return its HTML content and full URL address."""
    if payload is None:
        payload = {}
    if session:
        content = session.get(url, params=payload, verify=False, headers={"content-type": "text"})
    else:
        content = requests.get(url, params=payload, verify=False, headers={"content-type": "text"})
    content.raise_for_status()  # raise HTTPError for bad responses (4xx or 5xx)
    if parse:
        return html.fromstring(content.text), content.url
    else:
        return content.text, content.url


def get_html(link):
    """Return the parsed HTML of a page."""
    page_parsed, _ = http_request_get(url=link, payload={'t': ''}, parse=True)
    return page_parsed


cmslink = "https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report"

content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
linkTable = content.cssselect('td[headers="view-dlf-1-title-table-column"]')[0]
headers = linkTable[0].xpath('//a/@href')
df1 = pd.DataFrame(headers, columns=['links'])
df1SubSet = df1[df1['links'].str.contains('contract-summary', case=False)]

These are the two URLs that will give you all 166 entries. I have also changed the condition for capturing the hrefs. Give this a try.
cmslinks = [
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1'
]

df = pd.DataFrame()
for cmslink in cmslinks:
    print(cmslink)
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    linkTable = content.cssselect('td[headers="view-dlf-1-title-table-column"]')[0]
    headers = linkTable[0].xpath("//a[contains(text(),'Contract Summary') or contains(text(),'Monthly Enrollment by CPSC')]/@href")
    df1 = pd.DataFrame(headers, columns=['links'])
    df = df.append(df1)  # note: with pandas >= 2.0 use pd.concat([df, df1]) instead
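If you would rather not hardcode each page URL, a minimal sketch (reusing http_request_get from above, and assuming the listing keeps accepting the items_per_page and page query parameters seen in those URLs) could keep incrementing page until no more matching links come back:

import pandas as pd

base_url = ("https://www.cms.gov/Research-Statistics-Data-and-Systems/"
            "Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/"
            "Monthly-Contract-and-Enrollment-Summary-Report")

all_links = []
page = 0
while True:
    # ask for the next page of the listing via the query string
    content, _ = http_request_get(
        url=base_url,
        payload={'items_per_page': 100, 'page': page},
        parse=True,
    )
    # assumed stop condition: no matching hrefs means we ran past the last page
    hrefs = content.xpath(
        "//td[@headers='view-dlf-1-title-table-column']"
        "//a[contains(text(),'Contract Summary') or contains(text(),'Monthly Enrollment by CPSC')]/@href"
    )
    if not hrefs:
        break
    all_links.extend(hrefs)
    page += 1

df = pd.DataFrame(all_links, columns=['links'])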

Related

URLs from website domain with "hidden" layers

I cannot find a way to extract ALL URLs from the following website domains:
(1) https://www.ah.nl/zoeken?query=vegan
(2) https://www.jumbo.com/zoeken/?searchTerms=vegan
For the first, the problem is that the products are 'hidden': as a website visitor you need to click a button at the bottom of the page to show more items. I tried BeautifulSoup, but it does not extract the 'hidden' URLs.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas

req = Request('https://www.ah.nl/zoeken?query=vegan')
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")

df = pandas.DataFrame()
links = []
for link in soup.findAll('a'):
    links.append(link.get('href'))
With the second website link, the issue is that there are multiple pages, something the previous code also does not handle. In a previous question, it was suggested to use:
import requests
import pandas as pd

url = 'https://www.sainsburys.co.uk/groceries-api/gol-services/product/v1/product'
payload = {
    'filter[keyword]': 'vegan',
    'include[PRODUCT_AD]': 'citrus',
    'page_number': '1',
    'page_size': '2000',
    'sort_order': 'FAVOURITES_FIRST'
}
jsonData = requests.get(url, params=payload).json()
products = jsonData['products']
df = pd.DataFrame(products)
I have, however, not yet worked with requests and parameters, and cannot figure out how to adjust these parameters to work with link (2).
Hopefully someone can help me with these 2 website links. Thank you.
ah.nl site:
First of all, we get the number of pages from an initial request. Then, in a loop, we fetch each page. As an example, I print the data to the console: title, price and link.
import requests
import json


def get_data(query):
    page_size = 36
    # first request only tells us how many pages there are
    url = f"https://www.ah.nl/zoeken/api/products/search?page=1&size={page_size}&query={query}"
    response = requests.request("GET", url)
    json_obj = json.loads(response.text)
    for page in range(int(json_obj['page']['totalPages'])):
        url = f"https://www.ah.nl/zoeken/api/products/search?page={page}&size={page_size}&query={query}"
        response = requests.request("GET", url)
        json_obj = json.loads(response.text)
        for products in json_obj['cards']:
            for product in products['products']:
                print(product['title'], product['price']['now'], product['link'])


get_data('vegan')
If you have any questions, I'll be happy to answer. If you need a code example for the second site, let me know and I will add it.
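If you want to collect the results instead of printing them, a minimal sketch of the same endpoint could pass the query string through requests' params argument and gather the rows into a pandas DataFrame (the JSON field names are the ones used in the answer above):

import requests
import pandas as pd

def collect_products(query, page_size=36):
    url = "https://www.ah.nl/zoeken/api/products/search"
    # first request only to learn how many pages exist
    first = requests.get(url, params={"page": 0, "size": page_size, "query": query}).json()
    total_pages = int(first["page"]["totalPages"])

    rows = []
    for page in range(total_pages):
        data = requests.get(url, params={"page": page, "size": page_size, "query": query}).json()
        for card in data["cards"]:
            for product in card["products"]:
                rows.append({
                    "title": product["title"],
                    "price": product["price"]["now"],
                    "link": product["link"],
                })
    return pd.DataFrame(rows)

df = collect_products("vegan")
df.to_csv("ah_vegan_products.csv", index=False)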

Why not full data?

I am trying to get specific span tags from all 3 URLs, but the final CSV file only shows the data from the last URL.
Python code:
from selenium import webdriver
from lxml import etree
from bs4 import BeautifulSoup
import time
import pandas as pd

urls = []
for i in range(1, 4):
    if i == 1:
        url = "https://www.coinbase.com/price/s/listed"
        urls.append(url)
    else:
        url = "https://www.coinbase.com/price/s/listed" + f"?page={i}"
        urls.append(url)
print(urls)

for url in urls:
    wd = webdriver.Chrome()
    wd.get(url)
    time.sleep(30)
    resp = wd.page_source
    html = BeautifulSoup(resp, "lxml")
    tr = html.find_all("tr", class_="AssetTableRowDense__Row-sc-14h1499-1 lfkMjy")
    print(len(tr))
    names = []
    for i in tr:
        name1 = i.find("span", class_="TextElement__Spacer-hxkcw5-0 cicsNy Header__StyledHeader-sc-1xiyexz-0 kwgTEs AssetTableRowDense__StyledHeader-sc-14h1499-14 AssetTableRowDense__StyledHeaderDark-sc-14h1499-17 cWTMKR").text
        name2 = i.find("span", class_="TextElement__Spacer-hxkcw5-0 cicsNy Header__StyledHeader-sc-1xiyexz-0 bjBkPh AssetTableRowDense__StyledHeader-sc-14h1499-14 AssetTableRowDense__StyledHeaderLight-sc-14h1499-15 AssetTableRowDense__TickerText-sc-14h1499-16 cdqGcC").text
        names.append([name1, name2])

ns = pd.DataFrame(names)
date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
path = "/Users/paul/jpn traffic/coinbase/coinbase"
ns.to_csv(path + date + date + '.csv', index=None)
The results of the 2 print() calls look fine:
print(urls):['https://www.coinbase.com/price/s/listed', 'https://www.coinbase.com/price/s/listed?page=2', 'https://www.coinbase.com/price/s/listed?page=3']
print(len(tr))
26
30
16
So what's wrong with my code? Why don't I get the full data?
BTW, if I want to run my code on a cloud service every day at a given time, which option works best for a beginner Python learner? I don't need to store huge amounts of data in the cloud; I just need the Python script to send emails to my inbox, that's it.
Why isn't the full data there? Because the data is generated in the background, meaning the site loads it from an API, which is why you can't get it with BeautifulSoup alone. You can easily get the data using the API URL and requests. To find the API URL, open Chrome DevTools, go to the Network tab, filter by XHR, and click the Headers tab to get the URL; click the Preview tab to see the data.
Now the data comes through:
import requests

r = requests.get('https://www.coinbase.com/api/v2/assets/search?base=BDT&country=BD&filter=listed&include_prices=true&limit=30&order=asc&page=2&query=&resolution=day&sort=rank')
coinbase = r.json()['data']
for coin in coinbase:
    print(coin['name'])
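To collect every listed asset instead of a single page, a minimal sketch could keep increasing the page parameter of the same endpoint until it returns an empty data list; that this is how the endpoint paginates is an assumption based on the page=2 parameter visible in the URL above:

import requests
import pandas as pd

base_url = "https://www.coinbase.com/api/v2/assets/search"
params = {
    "base": "BDT", "country": "BD", "filter": "listed", "include_prices": "true",
    "limit": 30, "order": "asc", "query": "", "resolution": "day", "sort": "rank",
}

names = []
page = 1
while True:
    params["page"] = page
    data = requests.get(base_url, params=params).json().get("data", [])
    if not data:
        break  # assume an empty list means we are past the last page
    names.extend(coin["name"] for coin in data)
    page += 1

pd.DataFrame(names, columns=["name"]).to_csv("coinbase_listed.csv", index=False)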

BeautifulSoup WebScraping Issue: Cannot find specific classes for this specific Website (Python 3.7)

I am a bit new to web scraping. I have built scrapers with the methods below before, but with this specific website the parser cannot locate the class ('mainTitle___mbpq1'), which is the class that holds the text of the announcement. Whenever I run the code it returns None, and the same happens for the majority of other classes. I want to capture this info without using Selenium, since that slows the process down from what I understand. I think the issue is that the data comes from a JSON file loaded via script tags (I may be completely wrong, just a guess), but I do not know much about this area, so any help would be much appreciated.
The code below is what I have attempted, with no success.
from bs4 import BeautifulSoup
import re
import requests

# Method 1
url_4 = "https://www.kucoin.com/news/categories/listing"
res = requests.get(url_4)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
texts = soup.body
text = soup.body.div.find('div', {'class': 'mainTitle___mbpq1'})
print(text)

from bs4 import BeautifulSoup
import urllib3
import re

# Method 2
http = urllib3.PoolManager()
comm = re.compile("<!--|-->")
def make_soup(url):
    page = http.request('GET', url)
    soupdata = BeautifulSoup(page.data, features="lxml")
    return soupdata
soup = make_soup(url_4)
Annouce_Info = soup.find('div', {'class': 'mainTitle___mbpq1'})
print(Annouce_Info)
The data is loaded from an external source via JavaScript. To print all the article titles, you can use this example:
import json
import requests

url = "https://www.kucoin.com/_api/cms/articles"
params = {"page": 1, "pageSize": 10, "category": "listing", "lang": ""}

data = requests.get(url, json=params).json()

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for item in data["items"]:
    print(item["title"])
Prints:
PhoenixDAO (PHNX) Gets Listed on KuCoin!
LABS Group (LABS) Gets Listed on KuCoin! World Premiere!
Polkadex (PDEX) Gets Listed on KuCoin! World Premiere!
Announcement of Polkadex (PDEX) Token Sale on KuCoin Spotlight
KuCoin Futures Has Launched USDT Margined NEO, ONT, XMR, SNX Contracts
Introducing the Polkadex (PDEX) Token Sale on KuCoin Spotlight
Huobi Token (HT) Gets Listed on KuCoin!
KuCoin Futures Has Launched USDT Margined XEM, BAT, XTZ, QTUM Contracts
RedFOX Labs (RFOX) Gets Listed on KuCoin!
Boson Protocol (BOSON) Gets Listed on KuCoin! World Premiere!
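To get more than the first ten titles, a minimal sketch could keep increasing the page value in the same payload until an empty items list comes back; that the endpoint paginates exactly this way is an assumption based on the page/pageSize fields above:

import requests

url = "https://www.kucoin.com/_api/cms/articles"

titles = []
page = 1
while True:
    params = {"page": page, "pageSize": 10, "category": "listing", "lang": ""}
    data = requests.get(url, json=params).json()
    items = data.get("items", [])
    if not items:
        break  # assume an empty page means there is nothing left
    titles.extend(item["title"] for item in items)
    page += 1

print(len(titles), "titles collected")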
If you are trying to scrape information about new listings on crypto exchanges, you might be interested in this API:
https://rapidapi.com/Diver44/api/new-cryptocurrencies-listings/
import requests

url = "https://new-cryptocurrencies-listings.p.rapidapi.com/new_listings"
headers = {
    'x-rapidapi-host': "new-cryptocurrencies-listings.p.rapidapi.com",
    'x-rapidapi-key': "your-key"
}
response = requests.request("GET", url, headers=headers)
print(response.text)
It includes an endpoint with new listings from the biggest exchanges, and a very useful endpoint with information about which exchanges you can buy a specific coin on and its price on each of those exchanges.

How can I scrape data that is not in the page source?

scrape.py
# code to scrape the links from the html
from bs4 import BeautifulSoup
import urllib.request

data = open('scrapeFile', 'r')
html = data.read()
data.close()

soup = BeautifulSoup(html, features="html.parser")

# code to extract links
links = []
for div in soup.find_all('div', {'class': 'main-bar z-depth-1'}):
    # print(div.a.get('href'))
    links.append('https://godamwale.com' + str(div.a.get('href')))
print(links)

file = open("links.txt", "w")
for link in links:
    file.write(link + '\n')
    print(link)
I have successfully got the list of links using this code. But when I want to scrape data from those links, their HTML pages do not contain the source that holds the data, which makes extracting it tough. I have tried a Selenium driver, but it didn't work well for me.
I want to scrape the data from the link below, which contains data in HTML sections such as customer details, licence and automation, commercial details, floor-wise details and operational details. I want to extract this data along with name, location, contact number and type.
https://godamwale.com/list/result/591359c0d6b269eecc1d8933
The link is here. If someone finds a solution, please share it with me.
Using Developer Tools in your browser, you'll notice that whenever you visit that link there is a request to https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933 that returns a JSON response, probably containing the data you're looking for.
Python 2.x:
import urllib2, json
contents = json.loads(urllib2.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read())
print contents
Python 3.x:
import urllib.request, json
contents = json.loads(urllib.request.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read().decode('UTF-8'))
print(contents)
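If you are on Python 3 and already have requests installed, the same call can be written more concisely; this is just a sketch of the identical request using requests:

import requests

url = "https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933"
contents = requests.get(url).json()  # requests parses the JSON response for you
print(contents)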
Here you go. The main problem with the site seems to be that it takes time to load, which is why it was returning an incomplete page source. You have to wait until the page loads completely; notice the time.sleep(8) line in the code below:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import time

CHROMEDRIVER_PATH = r"C:\Users\XYZ\Downloads/Chromedriver.exe"  # raw string so the backslashes are not treated as escapes
wd = webdriver.Chrome(CHROMEDRIVER_PATH)
response = wd.get("https://godamwale.com/list/result/591359c0d6b269eecc1d8933")
time.sleep(8)  # wait until the page loads completely
soup = BeautifulSoup(wd.page_source, 'lxml')

props_list = []
propvalues_list = []

div = soup.find_all('div', {'class': 'row'})
for childtags in div[6].findChildren('div', {'class': 'col s12 m4 info-col'}):
    props = childtags.find("span").contents
    props_list.append(props)
    propvalue = childtags.find("p", recursive=True).contents
    propvalues_list.append(propvalue)

print(props_list)
print(propvalues_list)
Note: the code will return the construction details in 2 separate lists.
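As a side note, instead of a fixed time.sleep(8) you could use Selenium's explicit waits, so the script waits only as long as the page actually needs. A sketch of that, reusing the 'row' class and CHROMEDRIVER_PATH from the code above (the 20-second timeout is an arbitrary choice):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wd = webdriver.Chrome(CHROMEDRIVER_PATH)
wd.get("https://godamwale.com/list/result/591359c0d6b269eecc1d8933")

# block until at least one element with class 'row' is present, or fail after 20 seconds
WebDriverWait(wd, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, "row"))
)

soup = BeautifulSoup(wd.page_source, "lxml")
print(len(soup.find_all("div", {"class": "row"})))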

Scraping an e-commerce website for product info listed on a single page

I tried to scrape flipkart.com (I randomly opened a category that displayed 60 products).
However, when I searched for all the links using BeautifulSoup, I didn't get the links pointing to each product. I obtained 37 links, none of which pointed to a product description page. Help!
import requests
from bs4 import BeautifulSoup

# a random product listing page
url = 'https://www.flipkart.com/search?q=mobile&sid=tyy/4io&as=on&as-show=on&otracker=start&as-pos=1_1_ic_mobile'
r = requests.get(url)
soup = BeautifulSoup(r.text, from_encoding="utf-8")
links = soup.find_all('a')
It gave all the links except the links to the product description pages.
As I understand it (warning, I'm a noob): when you open the page in question with a normal browser, there is JavaScript in the page that, when processed, creates additional HTML that your browser adds to the document it shows you. When you use the requests module to get the page HTML, it does not process this JavaScript, so it never gets this extra content. The info you want is contained in that missing content. So:
Based on code from this thread: Web-scraping JavaScript page with Python
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup


# Take this class for granted. Just use the result of rendering.
class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()


url = 'https://www.flipkart.com/search?q=mobile&sid=tyy/4io&as=on&as-show=on&otracker=start&as-pos=1_1_ic_mobile'
r = Render(url)
result = r.frame.toHtml()
soup = BeautifulSoup(result, 'lxml')
links = soup.find_all('div', {'class': 'col col-7-12'})
target_links = [link.parent.parent.parent for link in links]
for link in target_links:
    try:
        print(link.find('a')['href'])
    except TypeError:  # we caught unwanted links in the find_all
        pass
I'm sure the way I steered to the links could be improved.
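PyQt4's QtWebKit is quite dated, so if you are allowed to run a browser driver, the same render-then-parse idea works with Selenium plus BeautifulSoup. A sketch of that alternative, reusing the 'col col-7-12' class and the parent-walking from the answer above:

from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://www.flipkart.com/search?q=mobile&sid=tyy/4io&as=on&as-show=on&otracker=start&as-pos=1_1_ic_mobile'

driver = webdriver.Chrome()
driver.get(url)            # the browser executes the page's JavaScript for us
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'lxml')
for cell in soup.find_all('div', {'class': 'col col-7-12'}):
    card = cell.parent.parent.parent   # walk up to the product card, as in the answer above
    a = card.find('a')
    if a and a.get('href'):
        print(a['href'])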
