Python opening a link with a variable - python-3.x

I have a problem with my Python code.
I would like the Telegram API URL to be opened with a variable in it, so that the item scraped from the site is sent to the chat.
# Import libraries
import requests
import urllib.request
import time
import sys
from bs4 import BeautifulSoup
# Redirect everything printed below into log.txt; the original stream is kept in stdoutOrigin.
stdoutOrigin=sys.stdout
sys.stdout = open("log.txt", "w")
# Set the URL you want to webscrape from
url = 'https://31asdasdasdasdasd.com/'
# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
zapisane = ''
# Take the first <strong> element found in the parsed page.
row = soup.find('strong')
print(">> Ilosc opinii ktora przeszla:")
# NOTE: print() returns None, so `send` is None; the text itself only goes to log.txt.
send = print(row.get_text()) # Print row as text
import urllib.request
# NOTE: the `text=` query parameter is empty, so the scraped value is not part of this request.
u = urllib.request.urlopen("https://api.telegram.org/botid:ts/sendMessage?chat_id=-3channel1&text=")

You likely want to use a string format with a variable in your last line of code shown here. Here's a helpful resource for string formatting: https://www.geeksforgeeks.org/python-format-function/

Related

BeautifulSoup Assignment got error module 'urllib' has no attribute 'urlopen' Can anyone provide solutions for this?

I am trying to do an assignment: write a Python program that expands on http://www.py4e.com/code3/urllinks.py. The program will use urllib to read the HTML from the data files below, extract the href= values from the anchor tags, scan for a tag that is in a particular position relative to the first name in the list, follow that link, repeat the process a number of times, and report the last name found.
Actual problem: Start at: http://py4e-data.dr-chuck.net/known_by_Kylen.html
Find the link at position 18 (the first name is 1). Follow that link. Repeat this process 7 times. The answer is the last name that you retrieve.
Hint: The first character of the name of the last page that you will load is: P
#Code I used:
import re
import urllib
import urllib.request
import urllib.parse
import urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Build an SSL context that skips hostname and certificate verification.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter URL:')
count = int(input('Enter count:'))
# The user enters a 1-based position; subtract 1 to get a list index.
position = int(input('Enter position:'))-1

# NOTE: this call is the one that raises the AttributeError quoted below.
html = urllib.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html,"html.parser")
href = soup('a')
#print href
# Follow the link at `position` on each page, `count` times in a row.
for i in range(count):
    link = href[position].get('href', None)
    print(href[position].contents[0])
    html = urllib.urlopen(link).read()
    soup = BeautifulSoup(html,"html.parser")
    href = soup('a')
But got an error: html = urllib.urlopen(url, context=ctx).read()
AttributeError: module 'urllib' has no attribute 'urlopen'
Can anyone provide solutions for this?
You imported urlopen already, but never used it. Instead you used urllib.urlopen which doesn't exist.
Instead of using urllib.urlopen just use urlopen
Example:
from urllib.request import urlopen
# before: html = urllib.urlopen(url, context=ctx).read()
# after: call the imported `urlopen` name directly
html = urlopen(url, context=ctx).read()

blacklist href in python to remove junk sites

I want it to print every site that isn't blacklisted (this is how the code looks so far), but it doesn't work.
If you replace `pass` in the last if statement with print(site), it prints everything that is in the blacklist, yet as written it won't print only the sites that aren't blacklisted, which is my goal.
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
import fnmatch

url = ("http://stackoverflow.com")
blacklist = ['*stackoverflow.com*', '*stackexchange.com*']
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
# Visit every anchor whose href contains "http".
for link in soup.select('a[href*="http"]'):
    site = (link.get('href'))
    site = str(site)
    # NOTE: the else branch runs once per blacklist pattern, so a site is
    # printed for every pattern it does not match.
    for filtering in blacklist:
        if fnmatch.fnmatch(site, filtering):
            pass
        else:
            print(site)
You want something like:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
import fnmatch

url = ("http://stackoverflow.com")
blacklist = ['*stackoverflow.com*', '*stackexchange.com*']
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
# Print every linked site that matches none of the blacklist patterns.
for link in soup.select('a[href*="http"]'):
    site = (link.get('href'))
    site = str(site)
    # A generator lets any() stop at the first matching pattern.
    if any(fnmatch.fnmatch(site, filtering) for filtering in blacklist):
        continue
    print(site)
The issue happens here (old code):
# Old loop: the else branch runs once per pattern that does not match.
for filtering in blacklist:
    if fnmatch.fnmatch(site, filtering):
        pass
    else:
        print(site)
While you're iterating over the blacklist, a blacklisted website matches one pattern but not the other, so the else branch still runs for the pattern it does not match and the site is always printed.
There are multiple solutions; mine was to use any() to check whether at least one pattern matches and, if so, continue the loop without printing :D

Extracting links from a URL until there is data and then moving to the next URL

I have a URL from which I am trying to extract data. I have found a way to extract data from that URL, but how do I then move on to the next URL once the current URL has no more data?
The base URL I have used in the main function is
https://posoco.in/reports/daily-reports/
Instead, I only want to extract data starting from 2020-21 and go on from here:
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
This way, once all the PDF files from 2020-21 have been extracted, the program should start extracting from the next URL (2021-22), and so on for as long as the website exists, so that the program can check each new year automatically.
The code I have written.
#import libraries
import re
import tabula
import datetime
from datetime import datetime,timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget
def source_urls(url):
    """Return the href of every anchor on the page at *url* that contains 'daily-reports-'.

    The page is downloaded with urllib and parsed with BeautifulSoup's
    'html.parser'. Anchors without an href are skipped.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    return [
        link.get('href')
        for link in soup('a')
        # Default to '' so anchors without an href do not break the `in` test.
        if 'daily-reports-' in link.get('href', '')
    ]
def get_urls(url):
    """Return the href of every anchor on the page at *url* that contains '_nldc_psp/?wpdmdl'.

    The page is downloaded with urllib and parsed with BeautifulSoup's
    'html.parser'. Anchors without an href are skipped.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    return [
        link.get('href')
        for link in soup('a')
        # Default to '' so anchors without an href do not break the `in` test.
        if '_nldc_psp/?wpdmdl' in link.get('href', '')
    ]
if __name__ == "__main__":
    url = 'https://posoco.in/reports/daily-reports/'
    # Collect the per-year report pages, then visit them in sorted order.
    file_links = source_urls(url)
    sorted_file_links = sorted(file_links)
    for files in sorted_file_links:
        # Print every download link found on this year's page.
        sub_files = get_urls(files)
        for x in sub_files:
            print(x)
The program output
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the libraries, but only a few are used here. rest are used to download and for processing and logging.
Well, you have to set a condition that picks up only the main URLs whose year is equal to or later than 2020-21.
Then you can parse the inner URLs.
Also, there's no need to use .get here!
You are using a condition to pick up URLs that include _nldc_psp/?wpdmdl, which means a link is returned only if it contains that pattern, so you do NOT need to substitute an empty value with link.get('href','') and then return it!
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp
# Earliest yearly folder to include. '%Y-%d' reads the trailing two digits as
# a day of the month, which is only a trick to turn a 'YYYY-YY' label into a
# comparable datetime.
cond = datetime.strptime('2020-21', '%Y-%d')


async def get_urls(client):
    """Return the index-page links whose 'YYYY-YY' label is at or after `cond`."""
    r = await client.get('https://posoco.in/reports/daily-reports/')
    soup = BeautifulSoup(r.text, 'lxml')
    return [
        x['href']
        for x in soup.select('a[href*=reports-]')
        # Take the text after the third '-' and drop its last character
        # (presumably a trailing '/'), leaving the 'YYYY-YY' label.
        if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond
    ]


async def main():
    """Fetch the selected yearly pages concurrently and pretty-print their PDF links."""
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        links = await get_urls(client)

        async def get_pdf(url):
            # Download one yearly page and print every link containing "nldc_psp".
            r = await client.get(url)
            soup = BeautifulSoup(r.text, 'lxml')
            pp([x['href']
                for x in soup.select('a[href*="nldc_psp"]')])

        if links:
            for link in links:
                # Schedule one task per yearly page inside the nursery.
                nurse.start_soon(get_pdf, link)


if __name__ == "__main__":
    trio.run(main)

I am trying to extract text inside span_id, but getting blank output using python beautifulsoup

I am trying to extract the text inside a span with a given id, but I get blank output.
I have also tried using the text of the parent div element, but failed to extract it. Can anyone please help me?
Below is my code.
import requests
from bs4 import BeautifulSoup
# Fetch the page's HTML as served (requests does not run any JavaScript).
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
# Look up the <span id="tollfree"> element and print its text.
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
i want the text inside that span which is given mobile number.
You'll have to use Selenium, as that text is not present in the initial response, or at least not without searching through <script> tags.
# Import the class under its real name: the code below calls BeautifulSoup(...)
# and rebinds `soup` to the parsed document.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
# Raw string so the backslashes in the Windows path are not read as escape sequences.
driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
url='https://www.paperplatemakingmachines.com/'
driver.get(url)
# A fixed sleep gives the page's scripts time to run; Selenium's WebDriverWait
# is the more robust way to wait.
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
# Look up the <span id="tollfree"> element in the rendered page and print its text.
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
The data is actually rendered dynamically by a script. What you need to do is parse the data out of the script:
import requests
import re
from bs4 import BeautifulSoup
# Download the page and parse it with the lxml parser.
page = requests.get('https://www.paperplatemakingmachines.com/')
document = BeautifulSoup(page.text,'lxml')
# Search the first <script> element for the value assigned to `pns_no`.
first_script = document.find('script')
match = re.search("(?<=pns_no = \")(.*)(?=\";)", first_script.text)
mob = match.group()
print(mob)
Another way of using regex to find the number
import requests
import re
from bs4 import BeautifulSoup as bs
# Download the page and parse the raw bytes with the lxml parser.
response = requests.get('https://www.paperplatemakingmachines.com/',)
soup = bs(response.content, 'lxml')
# The same compiled pattern selects the <script> tag and extracts the digits.
pattern = re.compile(r'var pns_no = "(\d+)"')
script_tag = soup.find('script', text=pattern)
number = pattern.findall(script_tag.text)[0]
print('+91-' + number)

Get the name of Instagram profile and the date of post with Python

I'm in the process of learning python3 and I try to solve a simple task. I want to get the name of account and the date of post from instagram link.
import requests
from bs4 import BeautifulSoup
# Download the post page and parse it with the lxml parser.
response = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(response.text, 'lxml')
description = soup.select_one("meta[property='og:description']")
# Read the `content` attribute of the sibling just before og:description
# and keep the part before the first "•".
previous_tag = description.find_previous_sibling()
content = previous_tag.get("content")
name = content.split("•")[0]
print(name)
This code works sometimes with links like this https://www.instagram.com/kingtop
But I need it to work also with post of image like this https://www.instagram.com/p/BuxB00KFI-x/
That's all what I could make, but this is not working. And I can't get the date also.
Do you have any ideas? I appreciate any help.
I found a way to get the name of account. Now I'm trying to find a way to get an upload date
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError
start = time.time()

# Read the URLs to check, one per line; the file is closed once it is read.
with open('users.txt', 'r', encoding="ISO-8859-1") as file:
    urls = file.readlines()

# Open the output file once, in append mode, and close it when the loop ends.
with open('output2.txt', 'a') as output:
    for url in urls:
        url = url.strip('\n')
        try:
            req = requests.get(url)
            req.raise_for_status()
        except HTTPError:
            # The server answered with an HTTP error status.
            output.write('не найдена\n')
        except Exception:
            # Best effort: any other request failure is logged and skipped.
            output.write('не найдены\n')
        else:
            soup = BeautifulSoup(req.text, "lxml")
            # Take the canonical URL of the page and keep the first path
            # segment after the Instagram domain.
            the_url = soup.select("[rel='canonical']")[0]['href']
            the_url2 = the_url.replace('https://www.instagram.com/', '')
            head, sep, tail = the_url2.partition('/')
            output.write(head + '\n')

Resources