Scrape image's metadata from Facebook public posts - python-3.x

This is a follow-up question in my quest to get some data from Facebook public posts. I'm trying to collect images metadata this time (image's url). Link posts work fine but some posts return empty data. I used the same approach suggested in answers to my previous question but it doesn't work for the example below. Will appreciate suggestions!
# Fetch the public post HTML directly and parse the markup Facebook ships
# inside HTML comments for non-JS clients.
link = "https://www.facebook.com/228735667216/posts/10151653129902217"
res = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
# Facebook wraps the real markup in HTML comments; strip the markers so
# BeautifulSoup can see it.
comment = res.text.replace("-->", "").replace("<!--", "")
soup = BeautifulSoup(comment, "lxml")
image = soup.find("div", class_="uiScaledImageContainer _517g")
# Guard both lookups: some posts do not contain this container at all,
# which is why the original code blew up with AttributeError ("empty data")
# instead of printing nothing.
if image is not None:
    img = image.find("img", class_="scaledImageFitWidth img")
    if img is not None:
        href = img["src"]
        print(href)

Logging in with requests alone is not easy, so I intentionally skipped that library. You can try using selenium on its own, or selenium in combination with BeautifulSoup, to get the job done.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys

url = "https://www.facebook.com/228735667216/posts/10156284868312217"

chrome_options = webdriver.ChromeOptions()
# This is how you can make the browser headless.
chrome_options.add_argument("--headless")
# The following pref suppresses the notification popup that appears
# right after login.
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(url)
# Log in, then re-request the post now that the session is authenticated.
driver.find_element_by_id("email").send_keys("your_username")
driver.find_element_by_id("pass").send_keys("your_password", Keys.RETURN)
driver.get(url)

soup = BeautifulSoup(driver.page_source, "lxml")
# The paste had lost the loop-body indentation (a SyntaxError as written);
# the print belongs inside the loop.
for img in soup.find_all(class_="scaledImageFitWidth"):
    print(img.get("src"))
driver.quit()
Output are like (partial):
https://external.fdac17-1.fna.fbcdn.net/safe_image.php?d=AQBjBuP0TBYabtnO&w=540&h=282&url=https%3A%2F%2Fs3.amazonaws.com%2Fprod-cust-photo-posts-jfaikqealaka%2F3065-6e4c325b07b921fdefed4dd727881f8d.jpg&cfs=1&upscale=1&fallback=news_d_placeholder_publisher&_nc_hash=AQCVKXMSqvNiHZik
https://external.fdac17-1.fna.fbcdn.net/safe_image.php?d=AQCJ6RFOF4dY2xTn&w=100&h=100&url=https%3A%2F%2Fcdn.images.express.co.uk%2Fimg%2Fdynamic%2F106%2F750x445%2F1046936.jpg&cfs=1&upscale=1&fallback=news_d_placeholder_publisher_square&_nc_hash=AQAyFxRaZTGV47Se

Related

Scraping information from a div tag inside of a div tag

I have been trying to web scrape from this website: https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players
I want to get specific ratings, but I get nothing when I execute this code:
import requests
from bs4 import BeautifulSoup

# Download the stats page and try to locate the ratings container.
page = requests.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
document = BeautifulSoup(page.content, 'lxml')

# Look up the div carrying the ratings class and show what we found.
match = document.find('div', class_='css-gm45eu')
print(match)
output: None
How can I scrape what is in that class?
Try to use Selenium:
from selenium import webdriver

driver = webdriver.Chrome('PATH_TO --> chromedriver.exe')
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
# XPath attribute predicates use '@'; the pasted '#class' is invalid XPath
# and raises an InvalidSelectorException.
ratings = driver.find_elements_by_xpath('//div[@class="css-gm45eu"]')
# Collect the text of every matched element (the index loop in the paste
# had also lost its body indentation).
ratings_list = [rating.text for rating in ratings]
print(ratings_list)
Output:
['1.189', '1.109', '1.098', '1.031', '1.028', '1.005', '0.990', '0.981', '0.967', '0.936', '0.904', '0.846', '0.841', '0.840', '0.836', '0.809', '0.759', '0.726']
Download chromedriver.exe:
https://chromedriver.chromium.org/downloads
if you don't want a chrome window to open while running, use this code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless mode: no Chrome window opens while the script runs.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome('PATH_TO --> chromedriver.exe', options=options)
driver.get("https://octane.gg/events/e83e-rlcs-x-championship-europe/stats/players")
# XPath attribute predicates use '@'; the pasted '#class' is invalid XPath.
ratings = driver.find_elements_by_xpath('//div[@class="css-gm45eu"]')
# Gather each element's text (loop indentation was lost in the paste).
ratings_list = [rating.text for rating in ratings]
print(ratings_list)

How to extract name and links from a given website - python

For below mentioned website, I am trying to find the name and its corresponding link from that site. But not able to pass/get the data at all.
Using BeautifulSoup
from bs4 import BeautifulSoup
import requests

source = requests.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
soup = BeautifulSoup(source.text, 'html.parser')
mains = soup.find_all("div", {"class": "list-container-wrapper"})

name = []
lnks = []
# The loop body had lost its indentation in the paste; also look the
# anchor up once per wrapper instead of twice.
for main in mains:
    anchor = main.find("a")
    name.append(anchor.text)
    lnks.append(anchor.get('href'))
Using Selenium webdriver
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"chromedriver_win32\chromedriver.exe")
driver.get("https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0")

lnks = []
name = []
# Loop body indentation was lost in the paste; collect every href first.
for a in driver.find_elements_by_class_name('ng-star-inserted'):
    link = a.get_attribute('href')
    lnks.append(link)

# The CSS selector targets one specific heading link, so it is fetched
# once, outside the loop (presumably the intent — the pasted structure is
# ambiguous; verify against the original answer).
nm = driver.find_element_by_css_selector("#list-item-0 > div > h2 > a").text
name.append(nm)
I have tried both of the above methods.
Example:
name = ['Friday Night Flicks Drive-In at the Roadium', 'Open: Butterfly Pavilion and Nature Gardens']
lnks = ['https://mommypoppins.com/los-angeles-kids/event/in-person/friday-night-flicks-drive-in-at-the-roadium','https://mommypoppins.com/los-angeles-kids/event/in-person/open-butterfly-pavilion-and-nature-gardens']
Here's solution for webdriver:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)  # give the Angular app time to render the event list
# XPath attribute predicates use '@'; the pasted '#angularticsaction' is
# invalid XPath and would raise an InvalidSelectorException.
elements = driver.find_elements(By.XPATH, "//a[@angularticsaction='expanded-detail']")
# One {link text: href} mapping per event anchor.
attributes = [{el.text: el.get_attribute('href')} for el in elements]
print(attributes)
print(len(attributes))
driver.quit()
Here's solution with webdriver and bs4:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

# Render the page in a real browser, then hand the resulting HTML to
# BeautifulSoup for parsing.
browser = webdriver.Chrome()
browser.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)

document = BeautifulSoup(browser.page_source, 'html.parser')
anchors = document.find_all("a", {"angularticsaction": "expanded-detail"})

# Build one {link text: href} mapping per event anchor.
attributes = []
for anchor in anchors:
    attributes.append({anchor.text: anchor.get('href')})

print(attributes)
print(len(attributes))
browser.quit()
Here's solution with requests:
import requests

# The site backs its event list with a JSON endpoint; query it directly.
url = "https://mommypoppins.com"
response = requests.get(f"{url}/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all").json()

# One {title: absolute link} mapping per event record.
attributes = []
for r in response['results']:
    node = r['node'][r['nid']]
    attributes.append({r.get('node_title'): f"{url}{node['node_url']}"})

print(attributes)
print(len(attributes))
cheers!
The website is loaded dynamically, therefore requests won't support it. However, the data is available in JSON format via sending a GET request to:
https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all.
There's no need for BeautifulSoup or Selenium, using merely requests would work, which will make your code much faster.
import requests

# JSON endpoint that feeds the dynamically rendered events page.
URL = "https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all"
BASE_URL = "https://mommypoppins.com"

response = requests.get(URL).json()

names = []
links = []
# The paste had flattened the loop body; the three lines below belong
# inside the for loop.
for json_data in response["results"]:
    data = json_data["node"][json_data["nid"]]
    names.append(data["title"])
    links.append(BASE_URL + data["node_url"])

How to scrape links from faceit

I am trying to scrape code from a faceit room, this is what i've tried but it doesn't work.
Any help is much appreciated!
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.faceit.com/en/csgo/room/1-8d6729b5-cfeb-4059-8894-3b07e04e76b2')
soup = BeautifulSoup(r.content, 'html.parser')
# NOTE(review): find_all's first argument is a TAG name, so this searches
# for <href> elements (which do not exist) rather than for href attributes
# on elements with class "list-unstyled" — it always returns []. The page
# is also rendered by JavaScript, so the static HTML fetched by requests
# would not contain the demo links in any case (see the answers below).
extracted_link = soup.find_all('href', class_='list-unstyled')
print(extracted_link)
Example Link : https://www.faceit.com/en/csgo/room/1-8d6729b5-cfeb-4059-8894-3b07e04e76b2
Example Link Extracted : https://demos-europe-west2.faceit-cdn.net/csgo/f9eadb47-aea5-4672-9499-4f457c7d28bd.dem.gz
Example : https://paste.pics/AQBQY
All of the contents of the page is loaded dynamically, which means that BeautifulSoup won't see it. So you actually might be better off using selenium with a webdriver in headless mode.
For example:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless Chrome: the JS-rendered room page loads without a visible window.
opts = Options()
opts.headless = True
browser = webdriver.Chrome(options=opts)

room_url = "https://www.faceit.com/en/csgo/room/1-8d6729b5-cfeb-4059-8894-3b07e04e76b2"
browser.get(room_url)
time.sleep(2)

# The demo-download button carries the link we are after.
demo_button = browser.find_element_by_css_selector('.match-vs .btn-default')
print(demo_button.get_attribute("href"))
Output:
https://demos-europe-west2.faceit-cdn.net/csgo/f9eadb47-aea5-4672-9499-4f457c7d28bd.dem.gz
You can also do it using only requests:
import requests as r

# Query faceit's own match API for the room and read the response as JSON.
match_room = '1-8d6729b5-cfeb-4059-8894-3b07e04e76b2'
endpoint = 'https://api.faceit.com/match/v2/match/' + match_room
api_response = r.get(endpoint)
match_data = api_response.json()

# The demo download links live under payload -> demoURLs.
extracted_links = match_data['payload']['demoURLs']
print(extracted_links)
The code probes their API to get all the data at once as JSON, then just extract the needed information.

Reading multiple URLs from a text file, processing each web page, and scraping the content inside

I have a .txt file with a list of multiple URLs. My purpose is to open this .txt file, access each URL in each line, scrape the content inside each URL, and append the content with list of multiple URLs in the txt file to the "draft.csv" file.
When I tried to run other codes, the recommended request result shows "Please turn on JavaScript and refresh the page", so I intended to use Selenium instead to get this resolved. I am able to fetch all the pages as wanted, but unable to see the desired content in each link.
Below is list of multiple URLs for example:
http://example.com/2267/15175/index.html
http://example.com/2267/16796/index.html
http://example.com/2267/17895/index.html
This is my current code using Selenium and Requests.
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
import pandas as pd
import urllib.request
import requests

frame = []

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)

# The paste had flattened all block structure; it is restored here.
with open("draft.txt", "r") as file:
    for line in file:
        url = line.rstrip("\n")
        print(url)
        driver.get(url)
        html = etree.HTML(driver.page_source)
        # (The original also re-downloaded each url with requests into
        # variables that were never used — that redundant second fetch is
        # removed.)
        # XPath attribute predicates use '@'; the pasted '#id'/'#href'
        # are invalid XPath syntax.
        extract_link = html.xpath('//span[@id="my_two"]/table/tbody/tr/td/table[2]')
        for i in extract_link:
            link = i.xpath('./tbody/tr/td/div/p/a/@href')
            content = 'http://example.com' + link[0]
            frame.append({
                'content': content,
            })

# Write the CSV once, after every URL has been processed.
dfs = pd.DataFrame(frame)
dfs.to_csv('draft.csv', index=False, encoding='utf-8-sig')
Thank you in advance for helping me with this!
You need to drive selenium from inside a for loop over the URLs, and you can use bs4 for the scraping:
from selenium import webdriver
from bs4 import BeautifulSoup

# A driver instance is required before the loop (it was never created in
# the original sketch, which would raise NameError).
driver = webdriver.Chrome()

# 'with' closes the file automatically (the original leaked the handle).
with open("urls.txt") as f:
    urls = [url.strip() for url in f.readlines()]

# Python keywords are lowercase: the original 'For url in urls:' is a
# SyntaxError.
for url in urls:
    driver.get(url)
    ...
    html = driver.page_source
    soup = BeautifulSoup(html)
    information = soup.find('title')
    ...
driver.quit()

Get geolocation from webpage by scraping using Selenium

I try to scrape this page :
The goal is to collect the latitude and the longitude. However, I see that the HTML content doesn't change after any submit in the "Adresse" field, and I don't know whether this is the cause of my empty lists.
My script :
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://www.coordonnees-gps.fr/"
chrome_path = r"C:\Users\jbourbon\Desktop\chromedriver_win32 (1)\chromedriver.exe"

driver = webdriver.Chrome(chrome_path)
driver.maximize_window()
driver.get(url)

# NOTE(review): this fetch is independent of the selenium session above;
# the static HTML it returns is what gets parsed below, and the
# coordinates are injected client-side by JavaScript, which is why the
# lists come back empty.
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
g_data = soup.findAll("div", {"class": "col-md-9"})

latitude = []
longitude = []
# The paste had flattened the loop body; both appends belong inside it.
for item in g_data:
    latitude.append(item.contents[0].findAll("input", {"id": "latitude"}))
    longitude.append(item.contents[0].findAll("input", {"id": "longitude"}))

print(latitude)
print(longitude)
And here, what I have with my list
Ty :)
There's nothing wrong with what you're doing, the problem is that when you open the link using selenium or requests the geolocation is not available instantly, it's available after few seconds (aaf9ec0.js adds it to the html dynamically, so request won't work anyway), also seems like input#latitude is also not giving the values, you can get it from div#info_window.
I've modified the code a bit, it should get you the lat, long, works for me:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re

target = "https://www.coordonnees-gps.fr/"

# Let the browser load the page, then give the geolocation script a
# moment to populate the DOM before reading it.
browser = webdriver.Chrome()
browser.maximize_window()
browser.get(target)
time.sleep(2)

# Parse the live DOM from selenium — no second download is needed.
page = BeautifulSoup(browser.page_source, "html.parser")
info_window = page.findAll("div", {"id": "info_window"})[0]

# Extract the two coordinates from the rendered info window text.
latitude, longitude = re.findall(r'Latitude :</strong> ([\.\d]*) \| <strong>Longitude :</strong> ([\.\d]*)<br/>', str(info_window))[0]
print(latitude)
print(longitude)
Output
25.594095
85.137565
My best guess is that you have to enable GeoLocation for the chromedriver instance. See this answer for a lead on how to do that.

Resources