BeautifulSoup - trying to parse a website but unsure how to parse a JSON script? - python-3.x

As a fun side project I have been trying to parse a website for a random fact of the day.
I decided to try my hand at this today with BeautifulSoup4 and urllib3. Sadly, however, I am unsure how to dive any deeper into a script element than I already have.
This is my current output:
{
    "@context": "http://schema.org",
    "@type": "Article",
    "headline": "Fact of the Day: 51 Facts Of the Day for 3/19/2019 ←FACTSlides→",
    "image": "https://www.FACTSlides.com/imgs/ishots/8224.png",
    "author": "Luke Reiner",
    "genre": "facts",
    "publisher": {
        "@type": "Organization",
        "name": "FACTSlides",
        "logo": {
            "@type": "ImageObject",
            "url": "https:\/\/www.factslides.com\/imgs\/logo.png"
        }
    },
    "url": "https://www.factslides.com/s-Fact-Of-The-Day",
    "mainEntityOfPage": "https://www.factslides.com/s-Fact-Of-The-Day",
    "datePublished": "2019-03-19",
    "dateCreated": "2019-03-19",
    "dateModified": "2019-03-19",
    "description": "Description.",
    "articleBody": "Article clutter here."
}
The facts themselves are stored under articleBody and are not delimited; I was going to use '. ' as the delimiter if I got that far.
This is the code I have so far:
""" Get a random fact. """
import argparse
import json
import urllib3
from bs4 import BeautifulSoup
PARAMETERS = {
"u": ["url", "passes in a url.", "1"],
}
PARSER = argparse.ArgumentParser(
description="Arguments to parse a url."
)
HTTP = urllib3.PoolManager()
def __load_args(parser, cfg_list):
""" Loads the passed arguments. """
for cfg_key in cfg_list:
if len(cfg_list[cfg_key]) > 3:
parser.add_argument(
"-" + cfg_key,
"--" + cfg_list[cfg_key][0],
help=cfg_list[cfg_key][1],
action=cfg_list[cfg_key][2],
nargs=cfg_list[cfg_key][3],
)
else:
parser.add_argument(
"-" + cfg_key,
"--" + cfg_list[cfg_key][0],
default=None,
help=cfg_list[cfg_key][1],
)
def parse_args(parser, section_list=[]):
""" Parses the loaded arguments. """
for section in section_list:
__load_args(parser, section)
return parser.parse_args()
ARGS = parse_args(PARSER, [PARAMETERS])
RESPONSE = HTTP.request('GET', ARGS.url)
SOUP = BeautifulSoup(RESPONSE.data, features="html.parser")
SOUP_SCRIPT = SOUP.find_all("script")
JS_TEXT = SOUP.find('script', type='application/ld+json').text
print(JS_TEXT)
Any help would be appreciated.
NOTE: The url I was parsing for the facts is here.

As long as your JSON is text (a string), you can use json.loads() to read it in:
import json

JS_TEXT = '''{
    "@context": "http://schema.org",
    "@type": "Article",
    "headline": "Fact of the Day: 51 Facts Of the Day for 3/19/2019 ←FACTSlides→",
    "image": "https://www.FACTSlides.com/imgs/ishots/8224.png",
    "author": "Luke Reiner",
    "genre": "facts",
    "publisher": {
        "@type": "Organization",
        "name": "FACTSlides",
        "logo": {
            "@type": "ImageObject",
            "url": "https:\/\/www.factslides.com\/imgs\/logo.png"
        }
    },
    "url": "https://www.factslides.com/s-Fact-Of-The-Day",
    "mainEntityOfPage": "https://www.factslides.com/s-Fact-Of-The-Day",
    "datePublished": "2019-03-19",
    "dateCreated": "2019-03-19",
    "dateModified": "2019-03-19",
    "description": "Description.",
    "articleBody": "Article clutter here."
}'''

jsonObj = json.loads(JS_TEXT)
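Once loaded, the fields are ordinary dictionary keys. A minimal sketch of pulling the facts out of articleBody, assuming the '. ' delimiter the question proposes:

# The delimiter is imperfect: a fact spanning two sentences will be cut in half.
facts = [fact.strip() for fact in jsonObj['articleBody'].split('. ') if fact.strip()]
print(facts)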

I solved the issue with the help of @chitown88.
Here is the functioning code:
""" Get a random fact. """
import argparse
import random
import json
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
PARAMETERS = {
"u": ["url", "passes in a url.", "1"],
}
PARSER = argparse.ArgumentParser(
description="Arguments to parse a url."
)
HTTP = urllib3.PoolManager()
def __load_args(parser, cfg_list):
""" Loads the passed arguments. """
for cfg_key in cfg_list:
if len(cfg_list[cfg_key]) > 3:
parser.add_argument(
"-" + cfg_key,
"--" + cfg_list[cfg_key][0],
help=cfg_list[cfg_key][1],
action=cfg_list[cfg_key][2],
nargs=cfg_list[cfg_key][3],
)
else:
parser.add_argument(
"-" + cfg_key,
"--" + cfg_list[cfg_key][0],
default=None,
help=cfg_list[cfg_key][1],
)
def parse_args(parser, section_list=[]):
""" Parses the loaded arguments. """
for section in section_list:
__load_args(parser, section)
return parser.parse_args()
ARGS = parse_args(PARSER, [PARAMETERS])
RESPONSE = HTTP.request('GET', ARGS.url)
SOUP = BeautifulSoup(RESPONSE.data, features="html.parser")
SOUP_SCRIPT = SOUP.find_all("script")
JS_TEXT = SOUP.find('script', type='application/ld+json').text
JSON_OBJ = json.loads(JS_TEXT)
LIST_TEST = []
# print(JSON_OBJ['articleBody'])
for item in JSON_OBJ['articleBody'].split('. '):
LIST_TEST.append(item.strip())
print(random.choice(LIST_TEST) + ".")
I would like to note that my delimiter is not the best, as some of the 'facts' span two sentences.
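One possible refinement (a sketch, not tested against the live page) is to split on any sentence-ending punctuation with a regex; it still cannot rejoin a fact that spans two sentences, but it at least handles facts ending in '?' or '!':

import re

# Split wherever '.', '!' or '?' is followed by whitespace.
facts = [f.strip() for f in re.split(r'(?<=[.!?])\s+', JSON_OBJ['articleBody']) if f]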

Related

search a list in api / format url search for api

I want to generate 6 random numbers to use as Pokemon API IDs, put them in a list, and then use the 6 numbers in a URL search.
The URL doesn't recognise the list, so I need to convert the list to numbers; I'm not sure how to format them into the URL.
import random
import requests

pokemon_ID = []
# pokemon_ID_add = str(pokemon_ID)[1:-1]
# pokemon_ID2 = str(pokemon_ID)[1:-1]
for i in range(0, 6):
    number = random.randint(1, 151)
    while i in pokemon_ID:
        number = random.randint(1, 151)
    pokemon_ID.append(number)
url = 'https://pokeapi.co/api/v2/pokemon/{}/'.format(pokemon_ID)
response = requests.get(url)
pokemon = response.json()
print(pokemon)
You can use a loop to iterate over the random IDs and store each result in a list:
import json
import random
import requests

url = "https://pokeapi.co/api/v2/pokemon/{}/"

random_pokemon_ids = [random.randint(1, 151) for i in range(6)]

result = []
for id_ in random_pokemon_ids:
    pokemon = requests.get(url.format(id_)).json()
    result.append(pokemon)

# pretty print the result:
print(json.dumps(result, indent=4))
Prints:
[
    {
        "abilities": [
            {
                "ability": {
                    "name": "rock-head",
                    "url": "https://pokeapi.co/api/v2/ability/69/"
                },
                "is_hidden": false,
                "slot": 1
            },
            {
                "ability": {
                    "name": "lightning-rod",
                    "url": "https://pokeapi.co/api/v2/ability/31/"
                },
                "is_hidden": false,
                "slot": 2
            },
            {
                "ability": {
                    "name": "battle-armor",
                    "url": "https://pokeapi.co/api/v2/ability/4/"
                },
                "is_hidden": true,
                "slot": 3
            }
        ],
        "base_experience": 64,
        "forms": [
            {
                "name": "cubone",
                "url": "https://pokeapi.co/api/v2/pokemon-form/104/"
            }
        ],
...
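If the intent of the question's while loop was to avoid duplicate IDs, random.sample draws without replacement and gives 6 distinct values directly; a minimal sketch:

import random

# sample without replacement: the six IDs are guaranteed distinct
random_pokemon_ids = random.sample(range(1, 152), 6)
print(random_pokemon_ids)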

Flask API - create a nested JSON response grouped by a field from a single table

I have a basic API set up to do a basic POST and GET from a single table. I want to create a nested response, though, grouped by force_element_type.
model.py
from db import db
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy import text as sa_text

class ForceElementModel(db.Model):
    __tablename__ = 'force_element'
    __table_args__ = {'schema': 'force_element'}

    force_element_id = db.Column(UUID(as_uuid=True), primary_key=True, server_default=sa_text("uuid_generate_v4()"))
    name = db.Column(db.String(100), nullable=False)
    force_element_type = db.Column(db.String(20), nullable=False)

    def __init__(self, name, force_element_type):
        self.name = name
        self.force_element_type = force_element_type

    def json(self):
        return {'name': self.name, 'force_element_type': self.force_element_type}

    @classmethod
    def find_by_name(cls, name):
        return cls.query.filter_by(name=name).first()  # simple TOP 1 select

    def save_to_db(self):  # Upserting data
        db.session.add(self)
        db.session.commit()

    def delete_from_db(self):
        db.session.delete(self)
        db.session.commit()
resource.py
from flask_restful import Resource, reqparse
# from flask_jwt import jwt_required
from models.force_element import ForceElementModel

class ForceElement(Resource):
    parser = reqparse.RequestParser()  # only allow price changes, no name changes allowed
    parser.add_argument('force_element_type', type=str, required=True, help='This field cannot be left blank')

    # @jwt_required()
    def post(self, name):
        if ForceElementModel.find_by_name(name):
            return {'message': "A Force Element with name '{}' already exists.".format(name)}, 400
        data = ForceElement.parser.parse_args()
        force_element = ForceElementModel(name, data['force_element_type'])
        try:
            force_element.save_to_db()
        except:
            return {"message": "An error occurred inserting the item."}, 500
        return force_element.json(), 201

class ForceElementList(Resource):
    # @jwt_required()
    def get(self):
        return {'force_elements': [force_element.json() for force_element in ForceElementModel.query.all()]}

class ForceElementType(Resource):
    # @jwt_required()
    def get(self):
The GET endpoint using ForceElementList returns
{
    "force_elements": [
        {
            "name": "San Antonio",
            "force_element_type": "ship"
        },
        {
            "name": "Nimitz",
            "force_element_type": "ship"
        },
        {
            "name": "Nimitz- Starboard",
            "force_element_type": "Crew"
        },
        {
            "name": "Nimitz- Port",
            "force_element_type": "Crew"
        }
    ]
}
I don't know how to group by force_element_type and return
[
    "ship": [
        {
            "name": "San Antonio",
            "force_element_id": "xxx1"
        },
        {
            "name": "Nimitz",
            "force_element_id": "xxx2"
        }
    ],
    "crew": [
        {
            "name": "Nimitz- Starboard",
            "force_element_id": "yyy1"
        },
        {
            "name": "Nimitz- Port",
            "force_element_id": "yyy2"
        }
    ]
]
How do I create this separate endpoint?
OK, I got there; here is how I did it. Is there a better way?
Lesson one: use an online parser to check the JSON format. This is what I was actually aiming for, and the square bracket at the start had me scratching my head for a while:
{
    "ship": [
        {
            "name": "San Antonio",
            "force_element_id": "xxx1"
        },
        {
            "name": "Nimitz",
            "force_element_id": "xxx2"
        }
    ],
    "crew": [
        {
            "name": "Nimitz- Starboard",
            "force_element_id": "yyy1"
        },
        {
            "name": "Nimitz- Port",
            "force_element_id": "yyy2"
        }
    ]
}
This code creates the correct format for the output:
class ForceElementType(Resource):
    # @jwt_required()
    def get(self):
        types = {}
        force_elements = ForceElementModel.query.order_by(ForceElementModel.force_element_type.desc()).all()
        for force_element in force_elements:
            nested = {'name': force_element.name, 'force_element_id': str(force_element.force_element_id)}
            print(nested)
            if force_element.force_element_type not in types:
                types[force_element.force_element_type] = []
            types[force_element.force_element_type].append(nested)
        response = types
        return response  # hand the grouped dict back to flask_restful
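As for "is there a better way": collections.defaultdict removes the membership check; a sketch under the same model assumptions:

from collections import defaultdict

class ForceElementType(Resource):
    def get(self):
        types = defaultdict(list)
        for fe in ForceElementModel.query.order_by(ForceElementModel.force_element_type.desc()):
            # defaultdict creates the empty list on first access
            types[fe.force_element_type].append(
                {'name': fe.name, 'force_element_id': str(fe.force_element_id)}
            )
        return dict(types)  # plain dict serializes cleanly to JSON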

How to access data in a dictionary within a list in Python

I am currently working on a Python program that queries the public GitHub API URL to get a GitHub user's email address. The response is a huge list with a lot of dictionaries.
My code so far:
import requests
import json

# username = ''
username = 'FamousBern'
base_url = 'https://api.github.com/users/{}/events/public'
url = base_url.format(username)

try:
    res = requests.get(url)
    r = json.loads(res.text)
    # print(r)  # List slicing
    print(type(r))  # List that has a lot of dictionaries
    for i in r:
        if 'payload' in i:
            print(i['payload'][6])
    # matches = []
    # for match in r:
    #     if 'author' in match:
    #         matches.append(match)
    # print(matches)
    # print(r[18:])
except Exception as e:
    print(e)

# data = res.json()
# print(data)
# print(type(data))
# email = data['author']
# print(email)
By manually accessing this URL in the Chrome browser I get the following:
[
    {
        "id": "15069094667",
        "type": "PushEvent",
        "actor": {
            "id": 32365949,
            "login": "FamousBern",
            "display_login": "FamousBern",
            "gravatar_id": "",
            "url": "https://api.github.com/users/FamousBern",
            "avatar_url": "https://avatars.githubusercontent.com/u/32365949?"
        },
        "repo": {
            "id": 332684394,
            "name": "FamousBern/FamousBern",
            "url": "https://api.github.com/repos/FamousBern/FamousBern"
        },
        "payload": {
            "push_id": 6475329882,
            "size": 1,
            "distinct_size": 1,
            "ref": "refs/heads/main",
            "head": "f9c165226201c19fd6a6acd34f4ecb7a151f74b3",
            "before": "8b1a9ac283ba41391fbf1168937e70c2c8590a79",
            "commits": [
                {
                    "sha": "f9c165226201c19fd6a6acd34f4ecb7a151f74b3",
                    "author": {
                        "email": "bernardberbell@gmail.com",
                        "name": "FamousBern"
                    },
                    "message": "Changed input functionality",
                    "distinct": true,
                    "url": "https://api.github.com/repos/FamousBern/FamousBern/commits/f9c165226201c19fd6a6acd34f4ecb7a151f74b3"
                }
            ]
        },
The JSON object is huge as well; I just sliced it. I am interested in getting the email address in the author dictionary.
You're attempting to index into a dict with i['payload'][6], which will raise an error.
My personal preferred way of checking for key membership in nested dicts is to use the get method with a default of an empty dict.
import requests
import json

username = 'FamousBern'
base_url = 'https://api.github.com/users/{}/events/public'
url = base_url.format(username)

res = requests.get(url)
r = json.loads(res.text)

# for each dict in the list
for event in r:
    # using .get() means you can chain .get()s for nested dicts
    # and they won't fail even if the key doesn't exist
    commits = event.get('payload', dict()).get('commits', list())
    # also using .get() with an empty list default means
    # you can always iterate over commits
    for commit in commits:
        # email = commit.get('author', dict()).get('email', None)
        # is also an option if you're not sure if those keys will exist
        email = commit['author']['email']
        print(email)
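If the same address shows up across many events, a small follow-up sketch (built on the same response shape as above) collects only the unique ones:

emails = set()
for event in r:
    for commit in event.get('payload', dict()).get('commits', list()):
        # missing keys fall through to None instead of raising
        emails.add(commit.get('author', dict()).get('email', None))
emails.discard(None)
print(emails)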

How to do a POST web API call with Groovy code?

I have a POST method URL, 2 headers to pass, and a big body in JSON format that I need to call through Groovy code. But I am not sure on points like how to pass the headers and the big JSON object in the Groovy API call. Please help me on these points. I am writing this code in VS Code.
@Grab(group='org.codehaus.groovy.modules.http-builder', module='http-builder', version='0.7.1')
import groovyx.net.http.*
import static groovyx.net.http.ContentType.*
import static groovyx.net.http.Method.*

def post = new URL("https://xyxz/api/testRequest/generic").openConnection();
def message = '''{
    "test": "test",
    "test1": "test1\n\t",
    "test2": {
        "test3": "test3",
        "test4": "test4"
    }
}'''
post.setRequestMethod("POST")
post.setDoOutput(true)
post.setRequestProperty("Content-Type", "application/json")
post.setHeader("id","sadasdas1212134");
post.setHeader("id2","sdsd34sdsfdfdfdf");
post.getOutputStream().write(message.getBytes("UTF-8"));
def postRC = post.getResponseCode();
println(postRC);
if (postRC.equals(200)) {
    println(post.getInputStream().getText());
}
Straight from the ref-doc
import groovyx.net.http.HttpBuilder

def body = [
    "test": "test",
    "test1": "test1\n\t",
    "test2": [
        "test3": "test3",
        "test4": "test4"
    ]
]

def result = HttpBuilder.configure {
    request.uri = 'https://xyxz/api/testRequest/generic'
    request.headers.id = 'sadasdas1212134'
    request.headers.id2 = 'sdsd34sdsfdfdfdf'
    request.contentType = 'application/json'
    request.body = body
}.post()

println result

How to fix HTML information not being returned by Beautiful Soup?

I am unable to retrieve the product data I need from a website. I can see the HTML sections that I think I need to grab, but my code returns no data. It works for certain HTML tags on that same page, but not the one that I want.
I am a real beginner. I have watched YouTube videos and tried to go through the questions/responses here, and from what I can tell the data I need from the website may be something other than HTML but embedded in the HTML(?).
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.harristeeter.com/specials/weekly-list/best-deals'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

len(page_soup.findAll("div", {"class": "product_infoBox"}))
len(page_soup.findAll("div", {"class": "container"}))
In the code I can retrieve results for "container" (=5) but not "product_infoBox" (=0). "product_infoBox" is the section I need.
The page loads data dynamically via JSON, but you can obtain this data through requests as well. This script searches for a store, selects the first result, and loads the weekly specials:
import requests
from bs4 import BeautifulSoup
import json

store_search_url = 'https://www.harristeeter.com/api/v1/stores/search?Address={}&Radius=10000&AllStores=true&NewOrdering=false&OnlyPharmacy=false&OnlyFreshFood=false&FreshFoodOrdering=undefined'
weekly_specials_url = 'https://www.harristeeter.com/api/v1/stores/{}/departments/0/weekly_specials?'
headers = {'Referer': 'https://www.harristeeter.com/store-locator'}

with requests.session() as s:
    r = s.get('https://www.harristeeter.com/store-locator', headers=headers)
    store_search_data = s.get(store_search_url.format('pine ridge plaza, reynolda road'), headers=headers).json()

    # This prints all results from store search:
    # print(json.dumps(store_search_data, indent=4))

    # we select the first match:
    store_number = store_search_data['Data'][0]['Number']

    weekly_specials_data = s.get(weekly_specials_url.format(store_number), headers=headers).json()
    print(json.dumps(weekly_specials_data, indent=4))
Prints:
{
    "Status": "success",
    "Data": [
        {
            "ID": "4615146",
            "AdWeek": "2019-07-16",
            "DepartmentNumber": "4",
            "AdWeekExpires": "07/16/2019",
            "ActiveAdWeekRelease": "2019-07-16",
            "StartDate": "7/10/2019",
            "EndDate": "7/16/2019",
            "IsCardRequired": true,
            "Title": "Harris Teeter Cottage Cheese, Sour Cream, French",
            "Description": "8-16 oz",
            "Detail": "e-VIC Member Price $1.27",
            "Price": "2/$3",
            "SpecialPrice": "$1.27",
            "DesktopImageUrl": "https://23360934715048b8b9a2-b55d76cb69f0e86ca2d9837472129d5a.ssl.cf1.rackcdn.com/sm_4615146.jpg",
            "MobileImageUrl": "https://23360934715048b8b9a2-b55d76cb69f0e86ca2d9837472129d5a.ssl.cf1.rackcdn.com/sm_4615146.jpg",
            "Limit": "6",
            "Savings": "Save at Least 38\u00a2 on 2",
            "Size": "8-16 oz",
            "Subtitle": "Limit 6 at e-VIC Price",
            "IsAdded": false,
            "RetinaImageUrl": "https://23360934715048b8b9a2-b55d76cb69f0e86ca2d9837472129d5a.ssl.cf1.rackcdn.com/4615146.jpg",
            "TIE": "1",
            "Organic": "0",
            "Type": "EVIC",
            "DepartmentName": "Dairy & Chilled Foods"
        },
        {
            "ID": "4614507",
            ... and so on.
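From there, pulling out just the deal names and prices is an ordinary dictionary walk; a short sketch based only on the fields visible in the output above:

# each entry in 'Data' is one weekly special
for item in weekly_specials_data['Data']:
    print('{}: {} (ends {})'.format(item['Title'], item['Price'], item['EndDate']))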
