Scraping without specific strings in Python3 - python-3.x

I'm trying to scrape only emoji in Python 3. I used the startswith method with an if statement, but the result still contains some Unicode escape sequences, even though those emoji's HTML tags look the same as the others. I have no idea why some emoji are converted into Unicode escapes. Could you give me any advice, or is there any way to remove these escapes from the list?
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import re
import os

list0 = []
site_url = "https://www.emojiall.com/zh-hant/categories/A"
get_url = requests.get(site_url)
soup = BeautifulSoup(get_url.text, "lxml")

for script in soup(["span"]):
    script.extract()

emojis = soup.select('.emoji_font')
words = soup.select('.emoji_name_truncate')

for emoji0 in emojis:
    emoji1 = emoji0.getText()
    if not repr(emoji1).startswith(r'\U'):
        list0.append(emoji1)
    else:
        continue

print(list0)

Update: with the changed startswith check below it works well now.
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import re
import os

list0 = []
site_url = "https://www.emojiall.com/zh-hant/categories/A"
get_url = requests.get(site_url)
soup = BeautifulSoup(get_url.text, "lxml")

for script in soup(["span"]):
    script.extract()

emojis = soup.select('.emoji_font')
words = soup.select('.emoji_name_truncate')

for emoji0 in emojis:
    emoji1 = emoji0.getText()
    if not repr(emoji1).startswith(r"'\U"):
        list0.append(emoji1)
    else:
        continue

print(list0)
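For what it's worth, the reason the first version never matched is that repr() of a string includes the surrounding quote character, so a \U escape can never sit at index 0. A minimal sketch (not from the original post; the sample code point is only an illustration):

emoji = '\U0001fae0'                      # a code point some Unicode databases may not know
print(repr(emoji))                        # either "'🫠'" or "'\U0001fae0'" depending on the Python build
print(repr(emoji).startswith(r'\U'))      # always False: repr() starts with a quote character
print(repr(emoji).startswith(r"'\U"))     # True only when repr() falls back to the \U escape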

Related

Extracting links from a URL until there is data and then moving to the next URL

I have a URL from which I am trying to extract data, and I have found a way to extract it. But how do I then move on to the next URL once the existing URL has no more data?
The base URL I use in the main function is
https://posoco.in/reports/daily-reports/
Instead, I only want to start extracting data from 2020-21, i.e. from
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
That way, once all the PDF files from 2020-21 are extracted, the program should start extracting from the next URL (2021-22) and so on, for as long as the website has pages, so the program can check them automatically every year.
The code I have written.
#import libraries
import re
import tabula
import datetime
from datetime import datetime, timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget

def source_urls(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    filelink = []
    for link in tags:
        if ('daily-reports-' in link.get('href', '')):
            filelink.append(link.get('href'))
    return filelink

def get_urls(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    filelink = []
    for link in tags:
        if ('_nldc_psp/?wpdmdl' in link.get('href', '')):
            filelink.append(link.get('href'))
    return filelink

if __name__ == "__main__":
    url = 'https://posoco.in/reports/daily-reports/'
    file_links = source_urls(url)
    sorted_file_links = sorted(file_links)
    for files in sorted_file_links:
        sub_files = get_urls(files)
        for x in sub_files:
            print(x)
The program output
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the imports, but only a few are used here; the rest are used for downloading, processing, and logging.
Well, you have to set a condition so that it only picks up the main URLs that are equal to or later than 2020-21, and then you can parse the inner URLs.
Also, there's no need to use .get with a default here: since your condition only keeps URLs containing _nldc_psp/?wpdmdl, you don't need to fall back to an empty string with link.get('href', '') before returning it.
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp

cond = datetime.strptime('2020-21', '%Y-%d')

async def get_urls(client):
    r = await client.get('https://posoco.in/reports/daily-reports/')
    soup = BeautifulSoup(r.text, 'lxml')
    return [x['href'] for x in soup.select('a[href*=reports-]')
            if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond]

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        links = await get_urls(client)

        async def get_pdf(url):
            r = await client.get(url)
            soup = BeautifulSoup(r.text, 'lxml')
            pp([x['href'] for x in soup.select('a[href*="nldc_psp"]')])

        if links:
            for link in links:
                nurse.start_soon(get_pdf, link)

if __name__ == "__main__":
    trio.run(main)
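If you would rather stay with synchronous requests, the same year filter can be sketched like this; the helper names (year_from_href, report_pages) are illustrative, not from the thread, and assume the link structure shown in the output above:

import requests
from bs4 import BeautifulSoup

def year_from_href(href):
    # ".../daily-reports-2020-21/" -> 2020
    return int(href.rstrip('/').split('-')[-2])

def report_pages(base='https://posoco.in/reports/daily-reports/', start_year=2020):
    soup = BeautifulSoup(requests.get(base).text, 'lxml')
    links = [a['href'] for a in soup.select('a[href*="daily-reports-"]')]
    return sorted(h for h in links if year_from_href(h) >= start_year)

if __name__ == "__main__":
    for page in report_pages():
        soup = BeautifulSoup(requests.get(page).text, 'lxml')
        for a in soup.select('a[href*="nldc_psp"]'):
            print(a['href'])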

I am trying to extract text inside span_id, but getting blank output using python beautifulsoup

I am trying to extract the text inside a span tag by its id, but I get a blank output.
I have also tried using the parent div element's text, but that fails too. Please, can anyone help me?
Below is my code.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
I want the text inside that span, which is the mobile number.
You'll have to use Selenium, as that text is not present in the initial request, or at least not without searching through <script> tags.
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
url = 'https://www.paperplatemakingmachines.com/'
driver.get(url)
# It's better to use Selenium's WebDriverWait, but I'm still learning how to use that correctly
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
mob = soup.find('span', {"id": "tollfree"})
print(mob.text)
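As a follow-up to the WebDriverWait comment above, a hedged sketch of that approach (same driver path assumption as the answer) waits until the #tollfree span actually has text instead of sleeping a fixed five seconds:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('https://www.paperplatemakingmachines.com/')
# wait up to 10 seconds for the span to be filled in by the page's JavaScript
WebDriverWait(driver, 10).until(
    lambda d: d.find_element(By.ID, 'tollfree').text.strip() != '')
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(soup.find('span', {"id": "tollfree"}).text)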
The data is actually rendered dynamically through a script. What you need to do is parse the data from the script:
import requests
import re
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
script= soup.find('script')
mob = re.search("(?<=pns_no = \")(.*)(?=\";)", script.text).group()
print(mob)
Another way of using regex to find the number
import requests
import re
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.paperplatemakingmachines.com/',)
soup = bs(r.content, 'lxml')
r = re.compile(r'var pns_no = "(\d+)"')
data = soup.find('script', text=r).text
script = r.findall(data)[0]
print('+91-' + script)

error while using selenium and writing to file

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube

browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")
no_of_pagedowns = 100
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns -= 1

html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')

fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
    t = tag.get('href')
    text_file.write(t)
When I run the above code I get this error:
TypeError: write() argument must be str, not None
When I am not using Selenium I am able to do it. I am using Selenium because I want to scroll down the entire page before parsing it with BeautifulSoup.
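For what it's worth, the traceback points at tag.get('href') returning None for anchors that have no href attribute; a minimal sketch of the writing loop that skips those (my own guess at a fix, not from the thread):

with open(fname, "w+", encoding="utf-8") as text_file:
    for tag in tags:
        t = tag.get('href')
        if t is not None:              # anchors without an href return None
            text_file.write(t + "\n")  # one link per line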

Beautifulsoup image downloading error

I am trying to download the images from the image URLs that come back from a BeautifulSoup scrape. I was trying to get this code to work after reading some other examples, but I am getting this error:
f.write(requests.get(img))
TypeError: a bytes-like object is required, not 'Response'
The line f.write(requests.get(img)[What goes here?]) is causing me trouble now. I use soup = BeautifulSoup(source, 'html.parser'), whereas the reference uses soup = BeautifulSoup(r.content).
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
import urllib.request
from bs4 import BeautifulSoup
import codecs
import sys
import requests
from os.path import basename

lines = open('list.txt').read().splitlines()
for designers in lines:
    for i in range(1):
        url = 'https://www.example.com/Listings?st=' + author + '{}'.format(i)
        source = urllib.request.urlopen(url)
        soup = BeautifulSoup(source, 'html.parser')
        for products in soup.find_all('li', class_='widget'):
            image = products.find('img', class_='lazy-load')
            print(image.get('data-src'))
            img = (image.get('data-src'))
            with open(basename(img), "wb") as f:
                f.write(requests.get(img)**[What goes here?]**)
Thanks!
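For reference, requests.get() returns a Response object, and the raw bytes of the body live in its .content attribute, so the line in question would typically become something like this (a minimal sketch, assuming img is a direct image URL):

resp = requests.get(img)
with open(basename(img), "wb") as f:
    f.write(resp.content)   # .content is the response body as bytes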

I might have some Korean letter encoding issue.

I am using Python 3.6 and PyCharm 2016.2 and trying to crawl a web site.
In the category "보험사고이력 정보 : 내차 피해" (insurance accident history: damage to my own car, which covers the fifth table), I tried to crawl the data whenever one of the p tags has "- 사고일자" (accident date) in its contents.
Below is my code. It keeps returning nothing.
Please help.
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse
import re

popup_insurance = "http://www.bobaedream.co.kr/mycar/popup/mycarChart_B.php?car_number=35%EB%91%908475&tbl=cyber&cno=651451"
res = urllib.request.urlopen(popup_insurance)
html = res.read()
soup_insurance = BeautifulSoup(html, 'html.parser')
insurance_content_table = soup_insurance.find_all('table')
elem = soup_insurance.find("p", text="보험사고이력 정보 : 내차 피해")
while elem.string != "보험사고이력 정보 : 타차 가해":
    if "사고일자" in elem.next_sibling:
        print(elem.next_sibling)
    elem = elem.next_sibling
    if elem is None:
        break
You should loop through elem.next_sibling; NavigableStrings can be odd sometimes:
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse
import re

popup_insurance = "http://www.bobaedream.co.kr/mycar/popup/mycarChart_B.php?car_number=35%EB%91%908475&tbl=cyber&cno=651451"
res = urllib.request.urlopen(popup_insurance)
html = res.read()
soup_insurance = BeautifulSoup(html, 'html.parser')
insurance_content_table = soup_insurance.find_all('table')
elem = soup_insurance.find("p", text="보험사고이력 정보 : 내차 피해")
while elem.string != "보험사고이력 정보 : 타차 가해":
    for string in elem.next_sibling:
        if "사고일자" in string:
            print(elem.next_sibling.string.strip())
    elem = elem.next_sibling
    if elem is None:
        break
I am assuming (since you did not provide the expected output) that you wanted the accident date / repair cost bit.
This is nowhere near perfect or even elegant; I'm almost sure this can be done with just the for loop.
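For completeness, a hedged sketch of that "just the for loop" idea, assuming the section headings and the data paragraphs are siblings at the same level of the document (which may not match the real page structure):

start = soup_insurance.find("p", text="보험사고이력 정보 : 내차 피해")
for sib in start.find_next_siblings():
    # stop once the next section heading is reached
    if sib.get_text(strip=True) == "보험사고이력 정보 : 타차 가해":
        break
    if "사고일자" in sib.get_text():
        print(sib.get_text(strip=True))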
