BS4: scrape a specific table - python-3.x

I hope you are all doing well.
I want to scrape a more specific table using BS4. This is my code:
from bs4 import BeautifulSoup
import requests
# Download the page and parse it with the lxml backend.
url = 'test.com'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

# Print the cell texts of every table-body row, one list per row.
for row in soup.select('tbody tr'):
    row_text = [cell.text for cell in row.find_all('td')]
    print(row_text)
How can I get results like this:
Number, Name, address, telp, komoditi
1, "ABON JUARA" JUARA FOOD INDUSTRY, Jl. Jend Sudirman 339, Salatiga, Jawa Tengah, 0298-324060, Abon Sapi Dan Ayam
and save them to a CSV file?

import requests
from bs4 import BeautifulSoup
import csv
def main(url):
    """Scrape the table at *url* and save its rows to ``data.csv``.

    Rows are the ``tr[valign=top]`` elements inside ``table#newspaper-a``.
    The non-empty text fragments of each row become the CSV columns, written
    below a fixed ``No, Name, Address, Tel, Komoditi`` header.

    Args:
        url: Address of the page to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``table#newspaper-a`` element.
    """
    r = requests.get(url)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')

    table = soup.select_one("table#newspaper-a")
    if table is None:
        raise ValueError("table#newspaper-a not found in the page")
    target = table.select("tr[valign=top]")

    # newline="" lets the csv module manage line endings; an explicit
    # encoding keeps non-ASCII names from depending on the locale default.
    with open("data.csv", 'w', newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["No", "Name", "Address", "Tel", "Komoditi"])
        for item in target:
            fields = list(item.stripped_strings)
            # Drop the first six characters of the fourth field - presumably
            # a "Telp. " style label; confirm against the live page. Rows
            # with fewer than four fragments are written unchanged.
            if len(fields) > 3:
                fields[3] = fields[3][6:]
            writer.writerow(fields)


main("https://kemenperin.go.id/direktori-perusahaan?what=&prov=&hal=1")
Output: view-online

Related

How to scrape dynamic content with beautifulsoup?

Here's my script :
import warnings
warnings.filterwarnings("ignore")
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# Product pages to visit.
URLs = ['https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2']

# One list per output column; every visited URL appends one value to each.
TypeVendor, NameVendor, Marques = [], [], []
Brands, Refs, Prices = [], [], []
Links = []
n = 1

for url in URLs:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Constant vendor / brand metadata.
    TypeVendor.append('Distributeur')
    NameVendor.append('Frayssinet')
    Marques.append('Longines')

    # Values read from <span> elements of the page, looked up by CSS class.
    for column, css_class in (
        (Brands, 'main-detail__name'),
        (Refs, 'main-detail__ref'),
        (Prices, 'prix'),
    ):
        column.append(soup.find('span', class_=css_class).text)

    Links.append(url)
I understand why it doesn't work: .text isn't suited to dynamic content. But I cannot figure out how to scrape this kind of content. I know that if you find where the JSON data is stored, you can work with it and scrape the data.
But I checked the Network tab of the Google Chrome developer tools and didn't find anything.
Set headers on your request and store your information in a more structured way.
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
# A browser-like User-Agent is sent with every request.
headers = {'User-Agent': 'Mozilla/5.0'}
URLs = ['https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2']

# One dict per product page; the keys become the DataFrame columns.
data = []
for url in URLs:
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    name_tag = soup.find('span', class_='main-detail__name')
    brand_tag = soup.find('span', class_='main-detail__marque')
    ref_tag = soup.find('span', class_='main-detail__ref')
    # The price is read from the `content` attribute, not from the tag text.
    price_tag = soup.find('span', {'itemprop': 'price'})

    record = {
        'name': name_tag.get_text(strip=True),
        'brand': brand_tag.get_text(strip=True),
        'ref': ref_tag.get_text(strip=True),
        'price': price_tag.get('content'),
        'url': url,
    }
    data.append(record)

pd.DataFrame(data)
Output
name
brand
ref
price
url
Montre The Longines Legend Diver L3.774.4.30.2
Longines
Référence : L3.774.4.30.2
2240
https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2

How can I get BeautifulSoup info in the same row?

I am currently web scraping and would like to get the specifications on the same row. When I print it at the moment, column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row like this
text text text text text
so I can chop it up into different columns in Excel later.
Is there maybe a transposing command I could use, or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv
# NOTE(review): nesting below is reconstructed from an unindented paste.
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        """Scrape one listing page and write a CSV row per detail block."""
        listing_url = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        listing_page = requests.get(listing_url)
        listing_soup = BeautifulSoup(listing_page.content, 'html.parser')
        titles = listing_soup.findAll(
            'h2',
            class_="ads__unit__content__title ads__unit__content__title--fav-placeholder",
        )
        for title in titles:
            # The anchor's `id` attribute is appended to the base URL to
            # reach the individual ad page.
            ad_id = title.find('a')['id']
            ad_page = requests.get('https://www.url.com/' + str(ad_id))
            ad_soup = BeautifulSoup(ad_page.content, 'html.parser')
            for stats in ad_soup.findAll('div', class_="u-word-break"):
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})

    for x in range(1, 2):
        data(x)

print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.
Your specs.text is a string that contains \n newlines. You can split it on whitespace, then join it back together with a single space, i.e. ' '.join(specs.text.split())
import requests
from bs4 import BeautifulSoup
import csv
# NOTE(review): nesting below is reconstructed from an unindented paste.
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        """Scrape one listing page and write a single-line CSV row per block.

        Args:
            page_number: Value appended to the base listing URL.
        """
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        # Loop names no longer shadow this function (`data`) or the
        # builtin `id`.
        for ad in ads:
            anchor = ad.find('a')
            link = anchor['id']
            url = 'https://www.url.com/' + str(link)
            ad_page = requests.get(url)
            ad_soup = BeautifulSoup(ad_page.content, 'html.parser')
            for stats in ad_soup.findAll('div', class_="u-word-break"):
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                # split() with no argument breaks on any run of whitespace,
                # so joining with one space collapses the embedded newlines.
                address = ' '.join(address.text.split())
                specs = ' '.join(specs.text.split())  # <-- changed here
                skriver.writerow({'column1': address, 'column2': specs})

    for x in range(1, 2):
        data(x)

print('Ferdig, du kan åpne oslo.csv')

How to print item in a list without a for loop

I'm trying to get just the price off a website and found that class="udYkAW2UrhZln2Iv62EYb" gave me the price in one line. But when I try to print it out I keep getting
<span class="udYkAW2UrhZln2Iv62EYb">$0.312423</span>
and not just the price itself. I fixed this by using a for loop to get the item, but is there a way to display just the price with a print function, without a for loop?
Please and thank you.
Here's the code
from bs4 import BeautifulSoup as bs
import requests
url = 'https://robinhood.com/crypto/DOGE'
r = requests.get(url)

# Parse the downloaded page.
soup = bs(r.content, 'lxml')

# The price was found in <span class="udYkAW2UrhZln2Iv62EYb">.
# find() is used because this is the first element with that class.
price_class = soup.find('span', class_='udYkAW2UrhZln2Iv62EYb')
print(price_class)
type(price_class)
# printed: <span class="udYkAW2UrhZln2Iv62EYb">$0.312423</span>
# type(...) is bs4.element.Tag (echoed only in an interactive session)

# Iterating a Tag walks its children; here that is the single text node.
for child in price_class:
    print(child)
# printed: $0.312423
Use .text or .get_text():
from bs4 import BeautifulSoup as bs
import requests
url = "https://robinhood.com/crypto/DOGE"
response = requests.get(url)
soup = bs(response.content, "lxml")

# find() returns the first matching <span> as a Tag; its .text attribute
# holds just the string inside the element.
price_tag = soup.find("span", class_="udYkAW2UrhZln2Iv62EYb")
print(price_tag.text)  # <--- use .text
Prints:
$0.315917

'AttributeError: 'NoneType' object has no attribute '_contains_''

import requests
from bs4 import BeautifulSoup
import pandas as pd
import pdfkit
import re
URL = 'https://timesofindia.indiatimes.com/'

# NOTE: the page is downloaded and parsed twice in a row; the second pass
# simply replaces the result of the first.
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')

# Unique href values of every anchor; .get() yields None when an anchor
# has no href attribute.
all_links = set()
for anchor in soup.find_all('a'):
    all_links.add(anchor.get('href'))
s = list(all_links)
print(s)

# NOTE: `_contains_` (single underscores) is not an attribute of str or of
# None, so this line raises the AttributeError quoted in the title.
x = [href for href in s if href._contains_(URL)]

# Keep only the links that mention one of the search words.
find_words = ['cbse', 'first-day']
m = []
for s in x:
    if any(word in s for word in find_words):
        m.append(s)
print(m)
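Your contains line is not valid: the method is spelled __contains__ (double underscores), and the idiomatic way to call it is the in operator.
Try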
# hrefs collected with link.get('href') can be None; skip those (and empty
# strings) first, because `URL in None` raises TypeError.
x = [i for i in s if i and URL in i]

BeautifulSoup Absolute URLs Print to CSV

I've been going through tons of threads on here to see if I can find a way to fix this code, but I can't quite seem to get it to work. I'm trying to scrape links from a site and then write them to a CSV file. Here's the code:
I found a way to get 95% of the way there but am missing something for getting just the href:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import csv
# Download the page and hand the response object to BeautifulSoup.
j = urllib.request.urlopen("http://cnn.com")
soup = BeautifulSoup(j, "lxml")
# find_all() returns the matching <a> elements themselves, not their href
# strings.
data = soup.find_all('a', href=True)
# NOTE(review): indentation was lost in this paste, so it is unclear whether
# the `with` block below belongs inside this loop - confirm against the
# original post.
for url in soup.find_all('a', href=True):
#print(url.get('href'))
# Mode 'w' truncates marcel.csv every time it is opened.
with open('marcel.csv', 'w', newline='') as csvfile:
write = csv.writer(csvfile)
# Each element collected in `data` is passed to the writer as one row.
write.writerows(data)
Here is probably what you want to do.
from bs4 import BeautifulSoup
import requests #better than urllib
import csv
def write_links(path, links):
    """Write each link in *links* to *path* as its own one-column CSV row.

    Args:
        path: Destination file; it is created or overwritten.
        links: Iterable of strings, one per output row.
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Wrap every link in a list: writerows() treats each item as a row,
        # so a bare string would be split into one column per character.
        writer.writerows([link] for link in links)


if __name__ == "__main__":
    j = requests.get("http://cnn.com").content
    soup = BeautifulSoup(j, "lxml")

    # Collect the href of every anchor that has one, echoing each as we go.
    data = []
    for url in soup.find_all('a', href=True):
        print(url['href'])
        data.append(url['href'])
    print(data)

    write_links("marcel.csv", data)
I use openpyxl to do it:
from openpyxl import Workbook,load_workbook
I think it is very easy.
This is part of my project; you could try it:
def createExcel(self):
    """Create ``book_hist.xlsx`` with one sheet holding only a header row."""
    # write_only=True is the current keyword for the streaming mode that
    # older openpyxl releases exposed as optimized_write=True.
    wb = Workbook(write_only=True)
    ws = wb.create_sheet(title='书籍列表')
    # Column headers, written verbatim as the first row of the sheet.
    row0 = ['编号','条码号','题名','责任者','借阅日期','归还日期','馆藏地']
    ws.append(row0)
    save_path = 'book_hist.xlsx'
    wb.save(save_path)
def saveToExcel(self, data_list):
    """Append every row of *data_list* to the sheet in ``book_hist.xlsx``.

    Args:
        data_list: Iterable of rows; each row is handed to ``ws.append``.
    """
    save_path = 'book_hist.xlsx'
    wb = load_workbook(filename=save_path)
    # Index the workbook by sheet title; get_sheet_by_name() is deprecated.
    ws = wb['书籍列表']
    for row in data_list:
        ws.append(row)
    wb.save(save_path)

Resources