How do I stop argparse from printing the default search?

Link to my code: https://pastebin.com/y4zLD2Dp
The imports that have not been used yet will be used as I progress through my project; I just like to have all the imports I need ready to go. The goal of this program is to be a YouTube video downloader, starting with MP3 format. This is my first big project by my standards, and I have only been coding for just over two months.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import sqlite3
import argparse
import sys
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser

# To get a developer key visit https://console.developers.google.com.
DEVELOPER_KEY = ""
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"


def yt_search(options):
    '''Searches for results on youtube and stores them in the database.'''
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    search = youtube.search().list(q=options.q, part="id,snippet",
                                   maxResults=options.max_results).execute()
    video_id = []
    # Add results to a list and print them.
    for result in search.get("items", []):
        if result["id"]["kind"] == "youtube#video":
            video_id.append("%s (%s)" % (result["snippet"]["title"],
                                         result["id"]["videoId"]))
        else:
            continue
    print("Videos:\n", "\n".join(video_id), "\n")


def download(args):
    print(args)


# Arguments for the program.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="This program searches for Youtube links and allows you to "
                    "download songs from said list. Please remember to be "
                    "specific in your searches for best results.")
    parser.add_argument("--q", help="Search term", default="Aeryes")
    parser.add_argument("--max-results", help="Max results", default=25)
    parser.add_argument("--d", type=download,
                        help="Download a video from search results.")
    args = parser.parse_args()
    if len(sys.argv) < 2:
        parser.parse_args(['--help'])
        sys.exit(1)
    try:
        yt_search(args)
    except HttpError:
        print("HTTP error")
The problem I am having is that when I run the --d command in the CLI, it works and prints the argument as expected (this is just a test to confirm the functions work with the parser), but afterwards it prints a list of YouTube links from the --q default, which I do not want it to do. How do I stop this from happening? Should I use a subparser, or is there something I am missing?
If anyone has good resources for the argparse module other than the official docs, please share.
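No answer is quoted here, but one way to get the behaviour being asked about is the subparser route the question already mentions: give the parser separate search and download subcommands, so yt_search() only runs for an explicit search command. The sketch below is built around the yt_search() and download() functions from the question; the video_id argument is illustrative, and required=True on add_subparsers assumes Python 3.7+:

import argparse

parser = argparse.ArgumentParser(description="Search YouTube and download results.")
subparsers = parser.add_subparsers(dest="command", required=True)

# "search" subcommand: the only path that calls the YouTube API
search_p = subparsers.add_parser("search", help="Search for videos")
search_p.add_argument("--q", default="Aeryes", help="Search term")
search_p.add_argument("--max-results", type=int, default=25, help="Max results")

# "download" subcommand: runs download() and never touches yt_search()
download_p = subparsers.add_parser("download", help="Download a video from search results")
download_p.add_argument("video_id", help="ID of the video to download")

args = parser.parse_args()
if args.command == "search":
    yt_search(args)
elif args.command == "download":
    download(args.video_id)

With this layout the default --q search can never fire as a side effect of a download request, which is the behaviour the question is after.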

Related

Selenium can't find a CSS selector

Selenium raises a NoSuchElementException after retrieving exactly 9 entries from the website. I think the problem might be that the page content doesn't have enough time to load, but I'm not sure.
I wrote the code following this YouTube tutorial (nineteenth minute).
import requests
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome()
URL = 'https://www.alibaba.com//trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText=white+hoodie'
time.sleep(1)
driver.get(URL)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(2)
items = driver.find_elements_by_css_selector('.J-offer-wrapper')
num = 1
for i in items:
    print(num)
    product_name = i.find_element_by_css_selector('h4').text
    price = i.find_element_by_css_selector('.elements-offer-price-normal').text
    time.sleep(0.5)
    num += 1
    print(price, product_name)

# driver.close()
If you have a clue why Selenium stops at the 10th entry and how to overcome this issue, please share.
You are getting that because the 10th item is not like the rest. It's an ad thingy and not a hoodie as you've searched for. I suspect you'd want to exclude this so you are left only with the results you are actually interested in.
All you need to do is change the way you identify items (this is just one of the options):
items = driver.find_elements_by_css_selector('.img-switcher-parent')
You also need to add error handling, as shown below:
for i in items:
    print(num)
    try:
        product_name = i.find_element_by_css_selector('h4').text
    except:
        product_name = ''
    try:
        price = i.find_element_by_css_selector('.elements-offer-price-normal').text
    except:
        price = ''
    time.sleep(0.5)
    num += 1
    print(price, product_name)
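As a side note beyond the original answer: the find_element_by_* / find_elements_by_* helpers were removed in Selenium 4, so on a current install the same loop would use By locators instead. A minimal sketch, reusing the driver from the question:

from selenium.webdriver.common.by import By

items = driver.find_elements(By.CSS_SELECTOR, '.img-switcher-parent')
for i in items:
    try:
        product_name = i.find_element(By.CSS_SELECTOR, 'h4').text
    except Exception:
        product_name = ''
    try:
        price = i.find_element(By.CSS_SELECTOR, '.elements-offer-price-normal').text
    except Exception:
        price = ''
    print(price, product_name)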

How to Download webpage as .mhtml

I am able to successfully open a URL and save the resultant page as a .html file. However, I am unable to determine how to download and save a .mhtml (Web Page, Single File).
My code is:
import urllib.parse, time
from urllib.parse import urlparse
import urllib.request
url = ('https://www.example.com')
encoded_url = urllib.parse.quote(url, safe='')
print(encoded_url)
base_url = ("https://translate.google.co.uk/translate?sl=auto&tl=en&u=")
translation_url = base_url+encoded_url
print(translation_url)
req = urllib.request.Request(translation_url, headers={'User-Agent': 'Mozilla/6.0'})
print(req)
response = urllib.request.urlopen(req)
time.sleep(15)
print(response)
webContent = response.read()
print(webContent)
f = open('GoogleTranslated.html', 'wb')
f.write(webContent)
print(f)
f.close()
I have tried to use wget using the details captured in this question:
How to download a webpage (mhtml format) using wget in python, but the details are incomplete (or I am simply unable to understand them).
Any suggestions would be helpful at this stage.
Did you try using Selenium with a Chrome webdriver to save the page?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
import pyautogui
URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
FILE_NAME = ''
# open page with selenium
# (first need to download Chrome webdriver, or a firefox webdriver, etc)
driver = webdriver.Chrome()
driver.get(URL)
# wait until body is loaded
WebDriverWait(driver, 60).until(visibility_of_element_located((By.TAG_NAME, 'body')))
time.sleep(1)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if FILE_NAME != '':
    pyautogui.typewrite(FILE_NAME)
pyautogui.hotkey('enter')
I have a better solution, which does not involve any manual operation and lets you specify the path where the mhtml file is stored. I learned this from a Chinese blog. The key idea is to use a Chrome DevTools command.
The code is shown below as an example.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.qq.com/')
# Execute Chrome dev tool command to obtain the mhtml file
res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
# Write the file locally
with open('./store/qq.mhtml', 'w', newline='') as f:
f.write(res['data'])
driver.quit()
Hope this will help!
More information about Chrome DevTools Protocol commands can be found in the protocol documentation.
To save as mhtml, you need to add the argument '--save-page-as-mhtml':
options = webdriver.ChromeOptions()
options.add_argument('--save-page-as-mhtml')
driver = webdriver.Chrome(options=options)
I wrote it just the way I use it, so sorry if anything is wrong.
I created a class so you can reuse it; the example is in the three lines below.
You can also change the number of seconds to sleep as you like.
Incidentally, non-English keyboards such as Japanese and Korean (Hangul) keyboards are also supported.
import time
import uuid

import chromedriver_binary
from selenium import webdriver
import pyautogui
import pyperclip


class DonwloadMhtml(webdriver.Chrome):
    def __init__(self):
        super().__init__()
        self._first_save = True
        time.sleep(2)

    def save_page(self, url, filename=None):
        self.get(url)
        time.sleep(3)
        # open 'Save as...' to save html and assets
        pyautogui.hotkey('ctrl', 's')
        time.sleep(1)
        if filename is None:
            pyperclip.copy(str(uuid.uuid4()))
        else:
            pyperclip.copy(filename)
        time.sleep(1)
        pyautogui.hotkey('ctrl', 'v')
        time.sleep(2)
        if self._first_save:
            # first save only: extra keystrokes to adjust the save dialog settings
            pyautogui.hotkey('tab')
            time.sleep(1)
            pyautogui.press('down')
            time.sleep(1)
            pyautogui.press('up')
            time.sleep(1)
            pyautogui.hotkey('enter')
            time.sleep(1)
            self._first_save = False
        pyautogui.hotkey('enter')
        time.sleep(1)


# example
dm = DonwloadMhtml()
dm.save_page('https://en.wikipedia.org/wiki/Python_(programming_language)', 'wikipedia_python')  # creates a file named "wikipedia_python.mhtml"
dm.save_page('https://www.python.org/')  # file named randomly based on uuid4
python3.8.10
selenium==4.4.3

Can not run 2 spiders successfully one after another in scrapy using a script

I am trying to run a script that meets these requirements:
After running the demo10.py script, the AmazonfeedSpider will crawl the product information using the generated urls saved in Purl and save the output into the dataset2.json file.
After successfully crawling and saving the data into the dataset2.json file, the ProductfeedSpider will run and grab the 5 urls returned by the Final_Product() method of the CompareString class.
Finally, after grabbing the final product_url list from the Comparestring4 class, the ProductfeedSpider will scrape data from the returned url list and save the result into the Fproduct.json file.
Here is the demo10.py file:
import scrapy
from scrapy.crawler import CrawlerProcess
from AmazonScrap.spiders.Amazonfeed2 import AmazonfeedSpider
from scrapy.utils.project import get_project_settings
from AmazonScrap.spiders.Productfeed import ProductfeedSpider
import time
# from multiprocessing import Process
# def CrawlAmazon():
def main():
    process1 = CrawlerProcess(settings=get_project_settings())
    process1.crawl(AmazonfeedSpider)
    process1.start()
    process1.join()

    # time.sleep(20)

    process2 = CrawlerProcess(settings=get_project_settings())
    process2.crawl(ProductfeedSpider)
    process2.start()
    process2.join()


if __name__ == "__main__":
    main()
After running the file, it raises an exception saying that the dataset.json file doesn't exist. Do I need to use multiprocessing in order to create a delay between the spiders? If so, how can I implement it?
I am looking forward to hearing from experts.
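None of the answers are included here, but the usual way to run two spiders strictly one after another from a single script is the pattern from the Scrapy documentation on running multiple spiders in the same process: chain them with CrawlerRunner and Twisted deferreds, since a CrawlerProcess cannot restart the reactor for a second start() call. This is a sketch adapted to the spider names above, not the asker's code:

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from AmazonScrap.spiders.Amazonfeed2 import AmazonfeedSpider
from AmazonScrap.spiders.Productfeed import ProductfeedSpider

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    # AmazonfeedSpider runs to completion first, so dataset2.json exists
    # before ProductfeedSpider starts and tries to read it.
    yield runner.crawl(AmazonfeedSpider)
    yield runner.crawl(ProductfeedSpider)
    reactor.stop()


crawl()
reactor.run()  # blocks until both crawls have finished

No multiprocessing or sleep is needed; the second yield simply does not run until the first crawl's deferred has fired.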

Python stuck at last program execution

I am new to Python and I think I broke my Python :(
I was trying Sentdex's PyQt4 YouTube tutorial right here.
I made the changes from PyQt4 to PyQt5. This is the code I was playing around with. I think I messed up by printing the whole page to the console.
Now the output is:
Load finished
Look at you shinin!
Press any key to continue . . .
This is shown for any code I execute. That is, Python shows this output even if I just try print("hello") in Visual Studio Code. I even tried restarting. Now, like a virus, it is not clearing.
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl


class Page(QWebEnginePage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        self.html = self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        self.html = html_str
        self.app.quit()


def main():
    page = Page('https://pythonprogramming.net/parsememcparseface/')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    js_test = soup.find('p', class_='jstest')
    print(js_test.text)
    print(soup)
    # js_test = soup.find('div', class_='aqi-meter-panel')
    # display.popen.terminate()


if __name__ == '__main__':
    main()
OK, so I finally got the problem fixed. I went manually into the temp files in C:\Users\xxx\AppData\Local and started on a deletion rampage, removing many files and folders remotely related to Python, VS Code and conda. This gave an error warning the first time I executed my program again; on subsequent runs there was no issue, and Python was back to its normal self. I'm surprised that I was not able to find any solution on the net for this.

How to quit selenium chrome driver when the driver is not selected? [duplicate]

This question already has answers here:
Selenium : How to stop geckodriver process impacting PC memory, without calling driver.quit()?
(1 answer)
PhantomJS web driver stays in memory
(1 answer)
Closed 3 years ago.
I wrote some code in Python using Selenium and multiprocessing to parallelize data collection. I am collecting some data from YouTube. I have a method that initiates a Chrome webdriver, and I used multiprocessing to collect data faster. The issue is that when the multiprocessing timeout is reached, the function that owns the chromedriver exits before the driver.quit() command can run. This leads to an accumulation of idle chromedrivers which I cannot close within Python since (to my knowledge) there is no way to reference them. Is there any way to close all chromedrivers without explicitly using the driver objects?
I wrote the code in python3. The chromedriver is Chrome version 72.
# Web related modules
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from urllib.request import urlopen
import html2text
# YouTube download module
from pytube import YouTube
# Multiprocessing tools
from multiprocessing import Lock, Manager, Queue, Pool
import multiprocessing as mp
# Misc modules
import time, re, pickle, os, shutil, argparse, glob, unicodedata, datetime
from argparse import RawTextHelpFormatter
# Irrelevant to the problem
def save_vids(vid_ids, save_loc):
    print('Irrelevant Function')


# Function that generates the initial list of urls to visit
def explore_home(chromedriver_path, chrome_options, caps):
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options, desired_capabilities=caps)
    driver.get('https://www.youtube.com')
    time.sleep(1)
    html_source = driver.page_source
    driver.close()
    parts = html_source.split('{"webCommandMetadata":{"url":"/watch_videos?')[1:]
    vids = []
    for part in parts:
        part = part[part.find('video_ids=')+10:]
        if part.find('\\u') != -1:
            if part.find('"') != -1:
                end = min(part.find('\\u'), part.find('"'))
            else:
                end = part.find('\\u')
        elif part.find('"') != -1:
            end = part.find('"')
        else:
            print('fuck')
        concat_list = part[:end]
        vids.extend(concat_list.split('%2C'))
    vids = [vid for vid in vids if len(re.findall(r'[0-9]|[a-z]|[A-Z]|_|-', vid)) == 11 and len(vid) == 11]
    return vids
# The function that generates chromedrivers and fails to quit if a multiprocessing timeout occurs.
def explore_vid(chromedriver_path, chrome_options, caps, vid, ads, save_loc, l):
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options, desired_capabilities=caps)
    driver.get('https://www.youtube.com/watch?v='+vid)
    time.sleep(2)
    sec_html = driver.page_source
    soup = BeautifulSoup(sec_html, 'lxml')
    mydivs = str(soup.findAll("div", {"class": "style-scope ytd-watch-next-secondary-results-renderer"}))
    inds = [m.start() for m in re.finditer('ytimg.com/vi/', mydivs)]
    rec_vids = ['https://www.youtube.com/watch?v='+mydivs[ind+13:ind+24] for ind in inds]
    browser_log = driver.get_log('performance')
    adInfo = find_ad(browser_log, vid)
    if adInfo:
        # Check if it is the first time this ad has been seen
        adID = adInfo[0]
        l.acquire()
        try:
            if adID in ads:
                ads[adID][0].append(adInfo[1])
            else:
                try:
                    element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".ytp-ad-button.ytp-ad-visit-advertiser-button.ytp-ad-button-link")))
                    element.click()
                    driver.switch_to.window(driver.window_handles[-1])
                    ad_website_URL = driver.current_url
                    ad_website_HTML = driver.page_source
                    clean_text = html2text.html2text(ad_website_HTML)
                    save_vids(adID, save_loc)
                    textName = os.path.join(save_loc, adID, 'adwebsite.txt')
                    file = open(textName, "w")
                    file.write(ad_website_URL)
                    file.write('\n')
                    file.write(clean_text)
                    file.close()
                    ads[adID] = [[adInfo[1]], ad_website_URL]
                except WebDriverException:
                    print('Button click failed: %s:%s' % (vid, adInfo[0]))
        finally:
            l.release()
    # The quit command for the chrome driver
    driver.quit()
    return rec_vids
def find_ad(browser_log, vid):
    for k in range(len(browser_log)):
        if browser_log[k]['message'].find('adunit') != -1 and browser_log[k]['message'].find(vid) != -1:
            ind = browser_log[k]['message'].find('https://www.youtube.com/get_video_info?html5=1&video_id=')
            vid_id = browser_log[k]['message'][ind+56:ind+67]
            return (vid_id, time.localtime())
    return None


def positive_int(argument):
    num = int(argument)
    if num < 1:
        msg = "Maximum depth parameter must be a positive number. You entered: %s" % argument
        raise argparse.ArgumentTypeError(msg)
    return num


def valid_pickle(argument):
    file = str(argument)
    if not file.endswith('.pickle'):
        msg = "ad_save_loc must end with .pickle You entered: %s" % file
        raise argparse.ArgumentTypeError(msg)
    return file


def valid_dir(argument):
    directory = str(argument)
    if not os.path.isdir(directory):
        msg = "vid_save_loc must be a valid directory. You entered: %s" % directory
        raise argparse.ArgumentTypeError(msg)
    return directory
if __name__ == '__main__':
    # Argument Parsing
    parser = argparse.ArgumentParser(description='Scrapes Youtube ads and advertising company websites. \nUse --restart to restart the scraping from scratch by deleting previous data\nExample Usage: python finalReader.py E:\ads\ads.pickle E:\ads --ncpu 2', formatter_class=RawTextHelpFormatter)
    parser.add_argument('ad_save_loc', help='Save Location for Ad Main Dictionary', type=valid_pickle)
    parser.add_argument('vid_save_loc', help='Save Location for Ad Videos', type=valid_dir)
    parser.add_argument('chromedriver_path', help='Path of the chrome executable', type=str)
    parser.add_argument('--restart', help='Restart collection', action="store_true", default=False, dest='restartCollection')
    parser.add_argument('--ncpu', nargs='?', help='Number of cores for multiprocessing, 1 by default', default=1, type=int, dest='mpcpu')
    parser.add_argument('--timeout', nargs='?', help='For how long the data collection will take place (in seconds), infinite by default', default=float('inf'), type=float, dest='time_limit')
    parser.add_argument('--max_depth', nargs='?', help='Depth of Youtube exploration tree', default=1, type=positive_int, dest='search_depth')
    args = parser.parse_args()

    ad_save_loc = args.ad_save_loc
    vid_save_loc = args.vid_save_loc
    vid_save_loc = os.path.join(vid_save_loc, 'ad_data')
    mpcpu = max(args.mpcpu, 1)
    time_limit = args.time_limit
    chromedriver_path = args.chromedriver_path
    search_depth = args.search_depth

    if not os.path.isdir(vid_save_loc):
        os.mkdir(vid_save_loc)

    if args.restartCollection:
        for the_file in os.listdir(vid_save_loc):
            file_path = os.path.join(vid_save_loc, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)
        if os.path.isfile(ad_save_loc):
            os.remove(ad_save_loc)
        ads = {}
    else:
        if os.path.isfile(ad_save_loc):
            pickle_in = open(ad_save_loc, "rb")
            ads = pickle.load(pickle_in)
        else:
            ads = {}

    # Chrome Driver Options
    chrome_options = Options()
    chrome_options.add_argument('--mute-audio')
    caps = DesiredCapabilities.CHROME
    caps['loggingPrefs'] = {'performance': 'ALL'}

    startTime = time.time()
    currentTime = time.time()

    # Data Collection Loop - Multiprocessing
    while currentTime - startTime < time_limit:
        print('Time from start: %s' % str(datetime.timedelta(seconds=currentTime-startTime)))
        rec_vids = explore_home(chromedriver_path, chrome_options, caps)
        while not rec_vids:
            time.sleep(60)
            rec_vids = explore_home(chromedriver_path, chrome_options, caps)
        m = Manager()
        lock = m.Lock()
        pool = Pool(processes=mpcpu)
        for depth in range(search_depth):
            print('Depth %s' % depth)
            multiple_results = [pool.apply_async(explore_vid, (chromedriver_path, chrome_options, caps, vid, ads, vid_save_loc, lock)) for vid in rec_vids]
            branching_vids = []
            for res in multiple_results:
                try:
                    branching_vids.append(res.get(timeout=30))
                    if time.time() - startTime < time_limit:
                        break
                except mp.TimeoutError:
                    print('Timeout')
            res_vids = branching_vids.copy()
        pickle_out = open(ad_save_loc, "wb")
        pickle.dump(ads, pickle_out)
        pickle_out.close()
        currentTime = time.time()
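None of the linked answers are quoted here, but since the question asks how to clean up drivers that can no longer be referenced from Python, one common workaround is to kill stray chromedriver processes by name after each pool round. This is only a sketch, under the assumption that the third-party psutil package is installed; it is not code from this thread:

import psutil

def kill_stray_chromedrivers():
    # Best-effort cleanup: terminate chromedriver processes left behind when a
    # worker hit the multiprocessing timeout before driver.quit() could run.
    for proc in psutil.process_iter(['name']):
        try:
            if proc.info['name'] and 'chromedriver' in proc.info['name'].lower():
                proc.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

The trade-off is precision: this also kills chromedriver processes started by other scripts on the same machine, so it only fits setups where nothing else is using chromedriver.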
