blacklist href in python to remove junk sites

blacklist href in python to remove junk sites - python-3.x

I want it to print every site that isnt blacklisted(how the code looks so far) but it doesnt work
if you change the string in the last if statement from pass to print(site) then it prints everything in the black list, yet it wont print everything that isnt blacklisted which is my goal
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
import fnmatch
url = ("http://stackoverflow.com")
blacklist = ['*stackoverflow.com*', '*stackexchange.com*']
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
site = (link.get('href'))
site = str(site)
for filtering in blacklist:
if fnmatch.fnmatch(site, filtering):
pass
else:
print(site)

You want something like:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
import fnmatch
url = ("http://stackoverflow.com")
blacklist = ['*stackoverflow.com*', '*stackexchange.com*']
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
site = (link.get('href'))
site = str(site)
if any([fnmatch.fnmatch(site, filtering) for filtering in blacklist]):
continue
print(site)
The issue happens here (old code):
for filtering in blacklist:
if fnmatch.fnmatch(site, filtering):
pass
else:
print(site)
While you're iterating here, if the website is blacklisted it will match one condition but not the other, so it will always be printed.
There are multiple solutions, mine was to use any() to check if the result is True at least once and if it is, continue the loop and don't print :D

Related

Extracting links from a URL until there is data and then moving to the next URL

I have a URL from where I am trying to extract data. I have found a way to extract data from that url. but how do i then move to the next URL if the existing URL doesn't have any data?
The base URL have used in main function is,
https://posoco.in/reports/daily-reports/
instead I only want to extract data from 2020-21 and then go on from here,
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
this way if the pdf files from 2020-21 is all extracted, then the program should start extracting from the next URL which is 2021-22 so on as long as the website exists in an automated way so program can check them automated every-year.
The code I have written.
#import libraries
import re
import tabula
import datetime
from datetime import datetime,timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget
def source_urls(url):
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
filelink = []
for link in tags:
if ('daily-reports-' in link.get('href','')):
filelink.append(link.get('href'))
return filelink
def get_urls(url):
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
filelink = []
for link in tags:
if ('_nldc_psp/?wpdmdl' in link.get('href','')):
filelink.append(link.get('href'))
return filelink
if __name__ == "__main__":
url = 'https://posoco.in/reports/daily-reports/'
file_links = source_urls(url)
sorted_file_links = sorted(file_links)
for files in sorted_file_links:
sub_files = get_urls(files)
for x in sub_files:
print(x)
The program output
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the libraries, but only a few are used here. rest are used to download and for processing and logging.

Well, you've to set an condition where it's will pickup the main urls which is equal/bigger than 2021-20
And then you can parse the inner urls.
Also, There's no need to use .get here!
Since you are using a condition to pickup urls include _nldc_psp/?wpdmdlwhich means if there's a pattern then return it. so you DONNOT need to replace the value with empty values link.get('href','') and then return it!
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp
cond = datetime.strptime('2020-21', '%Y-%d')
async def get_urls(client):
r = await client.get('https://posoco.in/reports/daily-reports/')
soup = BeautifulSoup(r.text, 'lxml')
return [x['href'] for x in soup.select('a[href*=reports-]') if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond]
async def main():
async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
links = await get_urls(client)
async def get_pdf(url):
r = await client.get(url)
soup = BeautifulSoup(r.text, 'lxml')
pp([x['href']
for x in soup.select('a[href*="nldc_psp"]')])
if links:
for link in links:
nurse.start_soon(get_pdf, link)
if __name__ == "__main__":
trio.run(main)

web scraping from news articles

I have been trying to access the links from a given news website. I've found the code which works really well, but the only issue is that, it outputs "javascript:void();" along with all the other links. Please let me know what changes I can make such that I don't encounter "javascript:void();" in the output with all the other links.
The following is the code:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import requests
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = requests.get("https://www.ndtv.com/coronavirus?pfrom=home-mainnavgation")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
for link in soup.find_all('a', href=True):
print(link['href'])

If you don't want them, just filter them out.
Here's how:
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
resp = requests.get("https://www.ndtv.com/coronavirus?pfrom=home-mainnavgation")
http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
encoding = html_encoding or http_encoding
soup = BeautifulSoup(resp.content, 'html.parser', from_encoding=encoding)
for link in soup.find_all('a', href=True):
if link["href"] != "javascript:void();":
print(link['href'])

Python opening a link with a variable

I have a problem with my code created in python.
I would like the URL API telegram to open with a change so that the downloaded item from the site is sent to chat.
# Import libraries
import requests
import urllib.request
import time
import sys
from bs4 import BeautifulSoup
stdoutOrigin=sys.stdout
sys.stdout = open("log.txt", "w")
# Set the URL you want to webscrape from
url = 'https://31asdasdasdasdasd.com/'
# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
zapisane = ''
row = soup.find('strong')
print(">> Ilosc opinii ktora przeszla:")
send = print(row.get_text()) # Print row as text
import urllib.request
u = urllib.request.urlopen("https://api.telegram.org/botid:ts/sendMessage?chat_id=-3channel1&text=")

You likely want to use a string format with a variable in your last line of code shown here. Here's a helpful resource for string formatting: https://www.geeksforgeeks.org/python-format-function/

I am trying to extract text inside span_id, but getting blank output using python beautifulsoup

i am tring to extract text inside span-id tag but getting blank output screen.
i have tried using parent element div text also , but fail to extract, please anyone help me.
below is my code.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
i want the text inside that span which is given mobile number.

You'll have to use Selenium as that text is not present in the initial request, or at least no without searching through <script> tags.
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
driver = webdriver.Chrome('C:\chromedriver_win32\chromedriver.exe')
url='https://www.paperplatemakingmachines.com/'
driver.get(url)
# It's better to use Selenium's WebDriverWait, but I'm still learning how to use that correctly
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)

The Data is actually rending dynamically through script. What you need to do is parse the data from script:
import requests
import re
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
script= soup.find('script')
mob = re.search("(?<=pns_no = \")(.*)(?=\";)", script.text).group()
print(mob)

Another way of using regex to find the number
import requests
import re
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.paperplatemakingmachines.com/',)
soup = bs(r.content, 'lxml')
r = re.compile(r'var pns_no = "(\d+)"')
data = soup.find('script', text=r).text
script = r.findall(data)[0]
print('+91-' + script)

Get the name of Instagram profile and the date of post with Python

I'm in the process of learning python3 and I try to solve a simple task. I want to get the name of account and the date of post from instagram link.
import requests
from bs4 import BeautifulSoup
html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(html.text, 'lxml')
item = soup.select_one("meta[property='og:description']")
name = item.find_previous_sibling().get("content").split("•")[0]
print(name)
This code works sometimes with links like this https://www.instagram.com/kingtop
But I need it to work also with post of image like this https://www.instagram.com/p/BuxB00KFI-x/
That's all what I could make, but this is not working. And I can't get the date also.
Do you have any ideas? I appreciate any help.

I found a way to get the name of account. Now I'm trying to find a way to get an upload date
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError
start = time.time()
file = open('users.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
for url in urls:
url = url.strip ('\n')
try:
req = requests.get(url)
req.raise_for_status()
except HTTPError as http_err:
output = open('output2.txt', 'a')
output.write(f'не найдена\n')
except Exception as err:
output = open('output2.txt', 'a')
output.write(f'не найдены\n')
else:
output = open('output2.txt', 'a')
soup = BeautifulSoup(req.text, "lxml")
the_url = soup.select("[rel='canonical']")[0]['href']
the_url2=the_url.replace('https://www.instagram.com/','')
head, sep, tail = the_url2.partition('/')
output.write (head+'\n')

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

blacklist href in python to remove junk sites - python-3.x

Related

Extracting links from a URL until there is data and then moving to the next URL

web scraping from news articles

Python opening a link with a variable

I am trying to extract text inside span_id, but getting blank output using python beautifulsoup

Get the name of Instagram profile and the date of post with Python

Categories

Resources