I'm trying to generate an array of [url_audioenci, url_caratula, titulo_cancion, nombre_artista] entries so I can download a list of music from http://los40.com.ar/lista40/. I know how to download media with the Requests library, but I can't extract the links from the page.
from bs4 import BeautifulSoup
import requests
# import re
url = 'http://los40.com.ar/m/lista40/'
videos = []
response = requests.get(url)
bs = BeautifulSoup(response.text, 'html.parser')
for i in range(1, 41):
    # finds nothing: 'datos_cancion_N' is a JavaScript variable, not an HTML tag
    videos.append(bs.find_all('datos_cancion_' + str(i)))
# responses= bs.find_all('script', language="javascript", type="text/javascript")
print(videos)
The page source contains one script block per song, like this:
<h3>LISTA DEL 08/06/2019</h3>
<script language="javascript" type="text/javascript">
var datos_cancion_1 = Array();
datos_cancion_1['url_audioenci'] = 'https://recursosweb.prisaradio.com/audios/dest/570005645440.mp4';
datos_cancion_1['url_muzu'] = '';
datos_cancion_1['url_youtube'] = 'https://www.youtube.com/watch?v=XsX3ATc3FbA';
datos_cancion_1['url_itunes'] = '';
datos_cancion_1['posicion'] = '1';
datos_cancion_1['url_caratula'] = 'https://recursosweb.prisaradio.com/fotos/dest/570005645461.jpg';
datos_cancion_1['titulo_cancion'] = 'Boy with luv';
datos_cancion_1['nombre_artista'] = 'BTS;Halsey';
datos_cancion_1['idYes'] = 'BTS';
datos_cancion_1['VidAu'] = 0;
</script>
I expect:
videos=[['https://recursosweb.prisaradio.com/audios/dest/570005645440.mp4','https://recursosweb.prisaradio.com/fotos/dest/570005645461.jpg','Boy with luv','BTS;Halsey'].....]
My attempt at filtering the data:
from bs4 import BeautifulSoup
import requests
url = 'http://los40.com.ar/m/lista40/'
videos = []
response = requests.get(url)
bs = BeautifulSoup(response.text, features="html5lib")
scripts = bs.find_all('script', language='javascript', type='text/javascript')
end = len(scripts)
start = end - 40
data = []
for i in range(start, end):
    data.append(str(scripts[i]))
print(data[0])
Output:
<script language="javascript" type="text/javascript">
var datos_cancion_1 = Array();
datos_cancion_1['url_audioenci'] = 'https://recursosweb.prisaradio.com/audios/dest/570005645440.mp4';
datos_cancion_1['url_muzu'] = '';
datos_cancion_1['url_youtube'] = 'https://www.youtube.com/watch?v=XsX3ATc3FbA';
datos_cancion_1['url_itunes'] = '';
datos_cancion_1['posicion'] = '1';
datos_cancion_1['url_caratula'] = 'https://recursosweb.prisaradio.com/fotos/dest/570005645461.jpg';
datos_cancion_1['titulo_cancion'] = 'Boy with luv';
datos_cancion_1['nombre_artista'] = 'BTS;Halsey';
datos_cancion_1['idYes'] = 'BTS';
datos_cancion_1['VidAu'] = 0;
</script>
data[0] through data[39] contain the top 40 entries with all the relevant data as strings, but I'm not sure how to extract the information from those strings.
There are suggestions in this thread to use the json or re modules, which I tried fiddling with, but I couldn't get them to work.
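For reference, one way to pull the four fields out of each script string is a regular expression over the datos_cancion_N['key'] = 'value'; lines. This is only a sketch building on the data list above; the key names are taken from the page source shown earlier:
import re

# matches lines like: datos_cancion_1['url_audioenci'] = '...';
pattern = re.compile(r"datos_cancion_\d+\['(\w+)'\]\s*=\s*'([^']*)'")
wanted = ['url_audioenci', 'url_caratula', 'titulo_cancion', 'nombre_artista']

videos = []
for script_text in data:
    fields = dict(pattern.findall(script_text))           # {'url_audioenci': '...', ...}
    videos.append([fields.get(key, '') for key in wanted])

print(videos[0])
# ['https://recursosweb.prisaradio.com/audios/dest/570005645440.mp4',
#  'https://recursosweb.prisaradio.com/fotos/dest/570005645461.jpg',
#  'Boy with luv', 'BTS;Halsey']
Each url_audioenci can then be downloaded with requests.get as you already do.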
I'm currently learning scraping with Scrapy, and I want to scrape data from https://www.espn.com/nba/stats/player or https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2.
If you go to the link you will see a "Show More" button at the bottom of the data. That confuses me, because if I scrape the page right now I only get 50 rows, which is not what I want. I looked at the "Show More" button, but its href is only "#".
[UPDATE] Using Scrapy + Playwright:
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        meta=dict(
            playwright=True,
            playwright_include_page=True,
            playwright_page_coroutines=[
                PageMethod('wait_for_selector', '//a[@class="AnchorLink loadMore__link"]'),
                PageMethod('click', '//a[@class="AnchorLink loadMore__link"]'),
            ],
        ),
        callback=self.parse,
    )

async def parse(self, response):
    page = response.meta['playwright_page']
    # this is the PageMethod object from the meta, not a page element
    button = response.meta['playwright_page_coroutines'][0]
    if button:
        await button.click()
    resp = response.body
    sel = Selector(text=resp)
    player_list = sel.xpath(
        "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath(
        "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    # note: wait_for_selector expects a selector string, not a SelectorList
    await page.wait_for_selector(player_list)
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
When using only Scrapy:
def start_requests(self):
    yield scrapy.Request(
        url='https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2',
        callback=self.parse,
    )

def parse(self, response):
    sel = Selector(text=response.body)
    player_list = sel.xpath(
        "//table[@class='Table Table--align-right Table--fixed Table--fixed-left']//tbody//tr")
    stats_list = sel.xpath(
        "//div[@class='Table__ScrollerWrapper relative overflow-hidden']/div[@class='Table__Scroller']/table/tbody/tr")
    for player, stat in zip(player_list, stats_list):
        player_name = player.xpath(".//a/text()").get()
        position = stat.xpath(".//td/div/text()").get()
        team_name = player.xpath(".//span/text()").get()
        game_played = stat.xpath(".//td[2]/text()").get()
        minutes_per_minute = stat.xpath(".//td[3]/text()").get()
        points_per_game = stat.xpath(".//td[4]/text()").get()
        fields_goal_made = stat.xpath(".//td[5]/text()").get()
        fields_goal_attempted = stat.xpath(".//td[6]/text()").get()
        field_goal_percentage = stat.xpath(".//td[7]/text()").get()
        three_point_goal_made = stat.xpath(".//td[8]/text()").get()
        yield {
            "player_name": player_name,
            "player_position": position,
            "team_name": team_name,
            "game_played": game_played,
            "minutes_per_minute": minutes_per_minute,
            "points_per_game": points_per_game,
            "fields_goal_made": fields_goal_made,
            "fields_goal_attempted": fields_goal_attempted,
            "field_goal_percentage": field_goal_percentage,
            "three_point_goal_made": three_point_goal_made,
        }
Am I doing it wrong here? Also, if you click "Show More" it calls the API like the one below; I can scrape from that API, but for now I want to get the data from the HTML with XPath itself.
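For reference, here is a rough sketch (not a verified solution) of one way the load-more link could be clicked in a loop through the Playwright page object before parsing. It assumes scrapy-playwright is installed and enabled in the project settings; the a.loadMore__link selector comes from the class name used in the XPath above, and looping until the link disappears avoids guessing how many clicks are needed:
import scrapy
from scrapy.selector import Selector


class NbaStatsSpider(scrapy.Spider):
    name = "nba_stats"

    def start_requests(self):
        yield scrapy.Request(
            url="https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2",
            meta={"playwright": True, "playwright_include_page": True},
            callback=self.parse,
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # Click the load-more link until it disappears, then parse the final HTML.
        while True:
            button = await page.query_selector("a.loadMore__link")
            if button is None:
                break
            await button.click()
            await page.wait_for_timeout(1000)  # crude wait for the new rows to render
        html = await page.content()
        await page.close()
        sel = Selector(text=html)
        for row in sel.xpath("//table[contains(@class, 'Table--fixed-left')]//tbody/tr"):
            yield {"player_name": row.xpath(".//a/text()").get()}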
I am trying to learn web scraping on asynchronous, JavaScript-heavy sites, and I chose a real estate website to do that. As a first step I did the search by hand and came up with the URL. Here it is:
CW_url = 'https://www.cushmanwakefield.com/en/united-states/properties/invest/invest-property-search#q=Los%20angeles&sort=%40propertylastupdateddate%20descending&f:PropertyType=[Office,Warehouse%2FDistribution]&f:Country=[United%20States]&f:StateProvince=[CA]'
I then tried to write code to read the page using Beautiful Soup:
while iterations < 10:
    time.sleep(5)
    html = driver.execute_script("return document.documentElement.outerHTML")
    sel_soup = bs(html, 'html.parser')
    # find_all('for sale') looks for a <for sale> tag; search the text instead
    forsales = sel_soup.find_all(string=lambda s: s and 'for sale' in s.lower())
    iterations += 1
    print(f'iteration {iterations} - forsales: {forsales}')
I also tried using requests-html:
from requests_html import HTMLSession, HTML
from requests_html import AsyncHTMLSession
asession = AsyncHTMLSession()
r = await asession.get(CW_url)
r.html.arender(wait = 5, sleep = 5)
r.text.find('for sale')
But this gives me -1, which means the text could not be found. r.text does give me a wall of HTML, and inside it there seems to be some JavaScript that has not been run yet:
<script type="text/javascript">
    var endpointConfiguration = {
        itemUri: "sitecore://web/{34F7EE0A-4405-44D6-BF43-13BC99AE8AEE}?lang=en&ver=4",
        siteName: "CushmanWakefield",
        restEndpointUri: "/coveo/rest"
    };
    if (typeof (CoveoForSitecore) !== "undefined") {
        CoveoForSitecore.SearchEndpoint.configureSitecoreEndpoint(endpointConfiguration);
        CoveoForSitecore.version = "5.0.788.5";
        var context = document.getElementById("coveo3a949f41");
        if (!!context) {
            CoveoForSitecore.Context.configureContext(context);
        }
    }
</script>
I thought the fact that the URL contains all the search criteria meant that the site makes the fetch request, returns the data, and generates the HTML. Apparently not! So, what am I doing wrong, and how do I deal with this or similar sites? Ideally, one would replace the search criteria in CW_url and let the code retrieve and store the data.
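For what it's worth, a common pattern for pages like this is to drive a real browser and wait explicitly for the rendered results before parsing. Here is a minimal sketch with Selenium; the .CoveoResult selector is only a guess based on the Coveo framework visible in the script above, not something verified against the page:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get(CW_url)

# Wait until the Coveo result list has actually been rendered by JavaScript.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".CoveoResult"))  # hypothetical selector
)

soup = BeautifulSoup(driver.page_source, "html.parser")
matches = soup.find_all(string=lambda s: s and "for sale" in s.lower())
print(len(matches), "matches for 'for sale'")
driver.quit()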
I'm receiving a variable from a Python script in Node.js.
I read this variable with:
subprocess.stdout.on('data', (data) => {
    dataString += data.toString();
})
but I can't use dataString outside of subprocess.stdout.on; it's just empty.
I need the data received from Python so I can use it elsewhere.
Any solution for this, please?
Node.js file:
const path = require('path')
const {spawn} = require('child_process')
const fs = require('fs')

var img = "./img/facture1.jpg"
const data64 = fs.readFileSync(img, 'base64')
const fetch = require("node-fetch")
var dataString = '';

var r = spawn('python', [
    "-u",
    path.join(__dirname, 'my_script.py'),
    img,
]);
r.stdin.write(data64);
r.stdin.end();

function runScript() {
    return r
}

const subprocess = runScript()

subprocess.stdout.on('data', (data) => {
    dataString += data.toString();
})

subprocess.stdout.on('end', () => {
    // dataString is only complete here, once the child's stdout has closed
    dataStr = dataString;
    console.log("\n i'm from python", dataString);
});
Python file:
#!/usr/bin/python
import io
import os
import sys, json
import base64
from typing import IO
from PIL import Image
import cv2
import numpy as np
import PIL.Image
import pytesseract
from pytesseract import image_to_string


def read_in():
    lines = sys.stdin.readlines()
    return lines


def stringToRGB(base64_string):
    imgdata = base64.b64decode(str(base64_string))
    image = Image.open(io.BytesIO(imgdata))
    return cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)


def search_string_in_file(file_name, string_to_search1, string_to_search2):
    line_number = 0
    list_of_results = []
    with open(file_name, 'r') as read_obj:
        for line in read_obj:
            line_number += 1
            if string_to_search1 in line:
                list_of_results.append(line.rstrip())
            if string_to_search2 in line:
                list_of_results.append(line.rstrip())
    return list_of_results


def main():
    lines = read_in()
    window_name = 'Image'
    image = stringToRGB(lines)
    imS = cv2.resize(image, (960, 700))
    pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
    image_to_text = pytesseract.image_to_string(image, lang='eng')
    name = sys.argv[1]
    name = name[2:-4]
    f = open('%s.txt' % name, "w")
    f.write(image_to_text)
    f.close()
    matched_lines = search_string_in_file('%s.txt' % name, 'Total', 'A PAYER')
    for elem in matched_lines:
        elem = elem[14:18]
        print("\n", elem)


if __name__ == "__main__":
    main()
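One thing that can make the hand-off easier on the Node side (just a sketch, assuming my_script.py can be changed; emit_results is a hypothetical helper): print everything once as a single JSON document, so whatever accumulates in dataString can be parsed in one go inside the 'end' handler.
import json
import sys

def emit_results(matched_lines):
    # Hypothetical helper: send all matched amounts to Node as one JSON object.
    totals = [elem[14:18] for elem in matched_lines]  # same slicing as in main()
    print(json.dumps({"totals": totals}))
    sys.stdout.flush()  # make sure Node gets the output before the process exits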
Hi, I am new to Python and REST APIs.
I am getting a 415 error while trying to run a query against the CMS using requests.post.
I am not able to pass Content-Type and Accept along with the logon token.
I am able to run this in Talend with these two headers.
Can you please help me with how to add these two headers to the requests.post call at the end?
Below is my code:
import requests
from lxml import etree
import xml.etree.ElementTree as ET
import pandas as pd
import openpyxl as x
from bs4 import BeautifulSoup
import xmltodict
protocol='http'
host='HOST'
port='6405'
content_type='application/xml'
base_url = protocol + '://' + host + ':' + port
bip_url = base_url + '/biprws'
webi_url = bip_url + '/raylight/v1'
sl_url = bip_url + '/sl/v1'
headers_auth = {
    'Content-Type': content_type,
    'Accept': 'application/xml'
}
headers = {}
username = 'user'
password = 'pass'

auth = requests.get(bip_url + '/logon/long', headers=headers)
root = etree.fromstring(auth.text)
root[3].text = username
root[0].text = password
etree.tostring(root)

send = requests.post(bip_url + '/logon/long',
                     headers=headers_auth,
                     data=etree.tostring(root))
tokenresp = etree.fromstring(send.content)
headers['X-SAP-LogonToken'] = tokenresp[3][0][0].text
folder_get = requests.get(bip_url + '/v1/cmsquery', headers=headers)
folder_root = etree.fromstring(folder_get.text)
Query_var = 'SELECT SI_ID,SI_NAME FROM CI_INFOOBJECTS WHERE SI_ANCESTOR = 12141'
folder_root[0].text = Query_var
data1 = etree.tostring(folder_root)
folder_post = requests.post(bip_url + '/v1/cmsquery', headers = headers, data = data1)
folder_post.status_code
I think 415 means that you're passing a content type that the API doesn't accept. You need to configure your headers correctly. Try this:
headers = {
    'Content-Type': 'application/xml'
}
auth = requests.get(bip_url + '/logon/long', headers=headers)
print(auth.status_code)
It looks like your problem is that you set headers to a blank dictionary.
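For the failing query call itself, the same fix applies: send both headers along with the logon token. A minimal sketch reusing the variables from the question:
# Send the CMS query with explicit Content-Type and Accept headers plus the logon token.
headers_query = {
    'X-SAP-LogonToken': tokenresp[3][0][0].text,
    'Content-Type': 'application/xml',
    'Accept': 'application/xml',
}
folder_post = requests.post(bip_url + '/v1/cmsquery', headers=headers_query, data=data1)
print(folder_post.status_code)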
I'm trying to learn how to use the Yahoo API, but when getting data from the website, it gives me an internal server error. I have tried every combination of league or leagues data, and even general game data, but everything gives an internal server error. I have attached my code below, and any help would be much appreciated.
import json
import time
import webbrowser
import pandas as pd
from pandas.io.json import json_normalize
from rauth import OAuth1Service
from rauth.utils import parse_utf8_qsl
credentials_file = open('auth.json')
credentials = json.load(credentials_file)
credentials_file.close()
oauth = OAuth1Service(consumer_key='key',
                      consumer_secret='secret',
                      name="yahoo",
                      request_token_url="https://api.login.yahoo.com/oauth/v2/get_request_token",
                      access_token_url="https://api.login.yahoo.com/oauth/v2/get_token",
                      authorize_url="https://api.login.yahoo.com/oauth/v2/request_auth",
                      base_url="http://fantasysports.yahooapis.com/")

request_token, request_token_secret = oauth.get_request_token(params={"oauth_callback": "oob"})
authorize_url = oauth.get_authorize_url(request_token)
webbrowser.open(authorize_url)
verify = input('Enter code: ')

raw_access = oauth.get_raw_access_token(request_token,
                                        request_token_secret,
                                        params={"oauth_verifier": verify})

parsed_access_token = parse_utf8_qsl(raw_access.content)
access_token = (parsed_access_token['oauth_token'],
                parsed_access_token['oauth_token_secret'])
start_time = time.time()
end_time = start_time + 3600
credentials['access_token'] = parsed_access_token['oauth_token']
credentials['access_token_secret'] = parsed_access_token['oauth_token_secret']
tokens = (credentials['access_token'], credentials['access_token_secret'])
s = oauth.get_session(tokens)
r = s.get('https://fantasysports.yahooapis.com/fantasy/v2/leagues;league_keys=nba.l.60379', params={'format': 'json'})
print(r.status_code)
r.json()
And that prints {u'error': {u'description': u'Internal server error', u'lang': u'en-US'}}
It seems like this issue stems from Yahoo's side. One user reported that switching to OAuth2 authentication worked fine:
https://forums.yahoo.net/t5/Help-with-Fantasy-Baseball/Receiving-500-quot-Internal-Server-Error-quot-from-Yahoo-Fantasy/td-p/341427/page/4
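For anyone who wants to try the OAuth2 route mentioned above, here is a rough sketch (not verified against the API) using requests_oauthlib instead of rauth. The oauth2 endpoint URLs are Yahoo's published OAuth2 endpoints; the 'oob' callback and the league URL are copied from the question, and 'key'/'secret' stand in for your own credentials:
from requests_oauthlib import OAuth2Session

client_id = 'key'          # your Yahoo consumer key
client_secret = 'secret'   # your Yahoo consumer secret

yahoo = OAuth2Session(client_id, redirect_uri='oob')
auth_url, state = yahoo.authorization_url('https://api.login.yahoo.com/oauth2/request_auth')
print('Authorize here:', auth_url)
code = input('Enter code: ')

yahoo.fetch_token('https://api.login.yahoo.com/oauth2/get_token',
                  client_secret=client_secret,
                  code=code)

r = yahoo.get('https://fantasysports.yahooapis.com/fantasy/v2/leagues;league_keys=nba.l.60379',
              params={'format': 'json'})
print(r.status_code)
print(r.json())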