Scrape livechat till the end of stream with selenium - python-3.x

I'm trying to scrape the youtube livechat. I need to save all old and incoming messages. For this purpose i use a css selector and a infinite loop to accomplish this, however this results in duplicate entries and previous messages being omitted. What is the proper way to do this? The target url is the first commandline argument.
from selenium import webdriver
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import os,re,sys
def parseyt():
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--executable_path="chromedriver.exe"')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-extensions')
chrome_bin = os.getenv('GOOGLE_CHROME_SHIM', None)
is_local = os.getenv('IS_LOCAL', None)
chromedriver_path = r'chromedriver.exe'
service_log_path = "{}/chromedriver.log".format('\.')
service_args = ['"--verbose", "--log-path=scrape.log"']
chromedriver_path = 'chromedriver.exe'
chrome_options.binary_location = r'C:\Program Files (x86)\Chromium\Application\chrome.exe'
browser = webdriver.Chrome(executable_path=chromedriver_path,chrome_options=chrome_options,service_args=service_args)
url = sys.argv[1]
url = url.replace(r'watch?',r'live_chat?')
print(url)
browser.get(url)
browser.implicitly_wait(1)
while True:
innerHTML = browser.execute_script("return document.body.innerHTML")
chats = []
for chat in browser.find_elements_by_css_selector('yt-live-chat-text-message-renderer'):
author_name = chat.find_element_by_css_selector("#author-name").get_attribute('innerHTML')
message = chat.find_element_by_css_selector("#message").get_attribute('innerHTML')
author_name_encoded = author_name.encode('utf-8').strip()
message_encoded = message.encode('utf-8').strip()
print(message+" "+author_name+"\n")
browser.quit()
return chats

It is better to use YouTube API instead.

Related

How to get post links from the whole page using BeautifulSoup Selenium

I'm having trouble trying to web scraping using BeautifulSoup and Selenium. The problem I have is i want to try pulling data from pages 1-20. But somehow the data that was successfully pulled was only up to page 10. It is possible that the number of the last page limit that I would take could be more than 20, but the results of the code I made could only pull 10 pages. Does anyone have an understanding for the problem to be able to pull a lot of data without page limit?
options = webdriver.ChromeOptions()
options.add_argument('-headless')
options.add_argument('-no-sandbox')
options.add_argument('-disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=options)
apartment_urls = []
try:
for page in range(1,20):
print(f"Extraction Page# {page}")
page="https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan&hlmn=" + str(page)
driver.get(page)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
for link in apart_info_list:
get_url = '{0}{1}'.format('https://www.99.co', link['href'])
print(get_url)
apartment_urls.append(get_url)
except:
print("Good Bye!")
This is the output of the code. When pages 10,11,12 and so on I can't get the data
Now, pagination is working fine without page limit.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()
while True:
soup = BeautifulSoup(driver.page_source, 'html.parser')
apart_info_list = soup.select('h2.search-card-redesign__address a')
for link in apart_info_list:
get_url = '{0}{1}'.format('https://www.99.co', link['href'])
print(get_url)
next_button = driver.find_element(By.CSS_SELECTOR,'li.next > a ')
if next_button:
button = next_button.click()
time.sleep(3)
else:
break
If you would prefer to use: webdriverManager
Alternative solution: As the next page url isn't dynamic, It's also working fine.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()
while True:
soup = BeautifulSoup(driver.page_source, 'html.parser')
apart_info_list = soup.select('h2.search-card-redesign__address a')
for link in apart_info_list:
get_url = '{0}{1}'.format('https://www.99.co', link['href'])
print(get_url)
# next_button = driver.find_element(By.CSS_SELECTOR,'li.next > a ')
# if next_button:
# button = next_button.click()
# time.sleep(3)
next_page = soup.select_one('li.next > a ')
if next_page:
next_page = f'https://www.99.co{next_page}'
else:
break

Scraping infomation from a div tag inside of a div tag

I have been trying to web scrape from this website: https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players
I want to get specific ratings but i get nothing when i execute this code:
from bs4 import BeautifulSoup
import requests
result = requests.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
src = result.content
soup = BeautifulSoup(src, 'lxml')
match = soup.find('div', class_='css-gm45eu')
print(match)
output: None
How can I scrape what is in that class?
Try to use Selenium:
from selenium import webdriver
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe')
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
ratings = driver.find_elements_by_xpath('//div[#class="css-gm45eu"]')
ratings_list = []
for p in range(len(ratings)):
ratings_list.append(ratings[p].text)
print(ratings_list)
Output:
['1.189', '1.109', '1.098', '1.031', '1.028', '1.005', '0.990', '0.981', '0.967', '0.936', '0.904', '0.846', '0.841', '0.840', '0.836', '0.809', '0.759', '0.726']
Download chromedriver.exe:
https://chromedriver.chromium.org/downloads
if you don't want a chrome window to open while running, use this code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe', options=options)
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
ratings = driver.find_elements_by_xpath('//div[#class="css-gm45eu"]')
ratings_list = []
for p in range(len(ratings)):
ratings_list.append(ratings[p].text)
print(ratings_list)

How to extract name and links from a given website - python

For below mentioned website, I am trying to find the name and its corresponding link from that site. But not able to pass/get the data at all.
Using BeautifulSoup
from bs4 import BeautifulSoup
import requests
source = requests.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
soup = BeautifulSoup(source.text, 'html.parser')
mains = soup.find_all("div", {"class": "list-container-wrapper"})
name = []
lnks = []
for main in mains:
name.append(main.find("a").text)
lnks.append(main.find("a").get('href'))
Using Selenium webdriver
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"chromedriver_win32\chromedriver.exe")
driver.get("https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0")
lnks = []
name = []
for a in driver.find_elements_by_class_name('ng-star-inserted'):
link = a.get_attribute('href')
lnks.append(link)
nm = driver.find_element_by_css_selector("#list-item-0 > div > h2 > a").text
name.append(nm)
I have tried with both 2 above methods.
Example:
name = ['Friday Night Flicks Drive-In at the Roadium', 'Open: Butterfly Pavilion and Nature Gardens']
lnks = ['https://mommypoppins.com/los-angeles-kids/event/in-person/friday-night-flicks-drive-in-at-the-roadium','https://mommypoppins.com/los-angeles-kids/event/in-person/open-butterfly-pavilion-and-nature-gardens']
Here's solution for webdriver:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)
elements = driver.find_elements(By.XPATH, "//a[#angularticsaction='expanded-detail']")
attributes = [{el.text: el.get_attribute('href')} for el in elements]
print(attributes)
print(len(attributes))
driver.quit()
Here's solution with webdriver and bs4:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)
soup = BeautifulSoup(driver.page_source, 'html.parser')
mains = soup.find_all("a", {"angularticsaction": "expanded-detail"})
attributes = [{el.text: el.get('href')} for el in mains]
print(attributes)
print(len(attributes))
driver.quit()
Here's solution with requests:
import requests
url = "https://mommypoppins.com"
response = requests.get(f"{url}/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all").json()
attributes = [{r.get('node_title'): f"{url}{r['node'][r['nid']]['node_url']}"} for r in response['results']]
print(attributes)
print(len(attributes))
cheers!
The website is loaded dynamically, therefore requests won't support it. However, the data is available in JSON format via sending a GET request to:
https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all.
There's no need for BeautifulSoup or Selenium, using merely requests would work, which will make your code much faster.
import requests
URL = "https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all"
BASE_URL = "https://mommypoppins.com"
response = requests.get(URL).json()
names = []
links = []
for json_data in response["results"]:
data = json_data["node"][json_data["nid"]]
names.append(data["title"])
links.append(BASE_URL + data["node_url"])

How to scrape links from faceit

I am trying to scrape code from a faceit room, this is what i've tried but it doesn't work.
Any help is much appreciated!
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.faceit.com/en/csgo/room/1-8d6729b5-cfeb-4059-8894-3b07e04e76b2')
soup = BeautifulSoup(r.content, 'html.parser')
extracted_link = soup.find_all('href', class_='list-unstyled')
print(extracted_link)
Example Link : https://www.faceit.com/en/csgo/room/1-8d6729b5-cfeb-4059-8894-3b07e04e76b2
Example Link Extracted : https://demos-europe-west2.faceit-cdn.net/csgo/f9eadb47-aea5-4672-9499-4f457c7d28bd.dem.gz
Example : https://paste.pics/AQBQY
All of the contents of the page is loaded dynamically, which means that BeautifulSoup won't see it. So you actually might be better off using selenium with a webdriver in headless mode.
For example:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
url = "https://www.faceit.com/en/csgo/room/1-8d6729b5-cfeb-4059-8894-3b07e04e76b2"
driver.get(url)
time.sleep(2)
element = driver.find_element_by_css_selector('.match-vs .btn-default')
print(element.get_attribute("href"))
Output:
https://demos-europe-west2.faceit-cdn.net/csgo/f9eadb47-aea5-4672-9499-4f457c7d28bd.dem.gz
You can also do it using only requests:
import requests as r
room_id = '1-8d6729b5-cfeb-4059-8894-3b07e04e76b2'
link = 'https://api.faceit.com/match/v2/match/'+room_id
res = r.get(link)
data = res.json()
extracted_links = data['payload']['demoURLs']
print(extracted_links)
The code probes their API to get all the data at once as JSON, then just extract the needed information.

Python and Selenium Get list of who liked post in instagram

i'm new in python, so plz don't bite me =)
The problem is next, i can't understand how to scroll the modal window in instagram using python and selenium. My task is get all people who liked the post. Now i stuck after the modal is opening a
from lxml import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-setuid-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument('--headless')
driver = webdriver.Chrome('/usr/bin/chromedriver',
chrome_options=chrome_options)
driver.get('https://www.instagram.com/p/B2Y2nwpCiYa/?igshid=sv6ovbyg07hx')
driver.implicitly_wait(0.7)
for elem in driver.find_elements_by_xpath(
'.//span[#class = "glyphsSpriteGrey_Close u-__7"]'):
elem.click()
print('close login')
button = driver.find_elements_by_xpath('.//button[#class = "sqdOP
yWX7d _8A5w5 "]')
button[0].click()
driver.implicitly_wait(60)
#user_loc = By.CSS_SELECTOR,
'div.Igw0E.rBNOH.eGOV_.ybXk5._4EzTm.XfCBB.HVWg4'
user_loc = By.CSS_SELECTOR, 'button.sqdOP.L3NKy'
#user_loc = By.CSS_SELECTOR, 'div._7UhW9.xLCgt.MMzan.KV-D4.fDxYl'
# Wait for first XHR complete
wait(driver, 20).until(EC.visibility_of_element_located(user_loc))
# Get current length of user list
current_len = len(driver.find_elements(*user_loc))
print(current_len)
while True:
driver.find_element(*user_loc).send_keys(Keys.END)
try:
wait(driver, 10).until(lambda x:
len(driver.find_elements(*user_loc)) > current_len)
current_len = len(driver.find_elements(*user_loc))
# # Return full list of songs
except TimeoutException:
user_list = [user for user in driver.find_elements(*user_loc)]
break
driver.get_screenshot_as_file('test.png')
print(len(user_list))

Resources