Union in loop Pyspark - apache-spark

I have two dataframes
data1 = [{'text': 'We traveled a long way to several beautiful houses to see the cats.', 'lang': 'eng'},
{'text': 'قطعنا شوطا طويلا إلى عدة منازل جميلة لرؤية القطط.', 'lang': 'arb'},
{'text': 'Wir reisten einen langen Weg zu mehreren schönen Häusern, um die Katzen zu sehen.', 'lang': 'deu'},
{'text': 'Nous avons parcouru un long chemin vers plusieurs belles maisons pour voir les chats.', 'lang': 'fra'}]
sdf1 = spark.createDataFrame(data1)
data2 = [{'text': 'Przebyliśmy długą drogę do kilku pięknych domów, aby zobaczyć koty.', 'lang': 'pol'},
{'text': 'Mēs ceļojām garu ceļu uz vairākām skaistām mājām, lai redzētu kaķus.', 'lang': 'lav'},
{'text': 'Kedileri görmek için birkaç güzel eve uzun bir yol kat ettik.', 'lang': 'tur'}]
sdf2 = spark.createDataFrame(data2)
I want to add only specific language rows from sdf2 to the first dataframe. I do it with a loop:
langs = ['pol', 'tur']
for lang in langs:
    sdf_l = sdf2.where(F.col('lang') == lang)
    sdf_final = sdf1.union(sdf_l)
But it only appends the rows for the last language in langs.

There is no need to use a loop here: in your loop sdf_final is rebuilt from sdf1 on every iteration, so only the last union survives. Filter sdf2 first, and then union it with sdf1.
import pyspark.sql.functions as F
...
langs = ['pol', 'tur']
sdf_final = sdf1.union(sdf2.filter(F.col('lang').isin(langs)))
If you do want to use a loop, define a temporary variable and union it into sdf1 on each iteration:
for lang in langs:
    sdf_1 = sdf2.where(F.col('lang') == lang)
    sdf1 = sdf1.union(sdf_1)
sdf1.show(truncate=False)
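If you prefer to avoid both the manual loop and the mutation of sdf1, the same accumulated union can be expressed with functools.reduce. A minimal sketch, reusing the DataFrame names from above:
from functools import reduce
import pyspark.sql.functions as F

langs = ['pol', 'tur']
# one filtered DataFrame per language, folded onto sdf1 with union
per_lang = [sdf2.where(F.col('lang') == lang) for lang in langs]
sdf_final = reduce(lambda acc, df: acc.union(df), per_lang, sdf1)
sdf_final.show(truncate=False)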

Related

Dataframe becoming empty after calling a method in DataWriter class that deletes records from delta table

How can I prevent the dataframe from becoming empty after calling the delete_processed_data() method in my DataWriter class, which also has a register_processed_data() method that inserts data into a delta table?
I'm not overwriting the dataframe; it's only used in a condition that checks, with a count, whether it has any data.
Here's my complete code (databricks notebook):
from datetime import *
import pandas as pd
from dn.utils import table
import pyspark.sql.functions as F
from delta.tables import *
from pyspark.sql.types import *
import json
import pytz
import calendar
list_countries = (
    table.get_silver_table(table_name='stpos_dim_itemticket')
    .select('pais')
    .distinct()
)
list_countries = [row.pais for row in list_countries.collect()]
# Include "Todos" option
list_countries.insert(0, 'Todos')
dbutils.widgets.removeAll()
dbutils.widgets.text(name='category', defaultValue='Todos', label='Categoria')
dbutils.widgets.text(name='today', defaultValue=str(date.today()), label='Fecha proceso')
dbutils.widgets.dropdown(name="country", defaultValue='Todos', choices=list_countries, label="Pais")
dbutils.widgets.dropdown(name='forced_load', defaultValue='no', choices=['si', 'no'], label='Forzar carga')
dbutils.widgets.dropdown(name="reprocessing", defaultValue='si', choices=['si', 'no'], label="Reproceso")
country = dbutils.widgets.get('country').strip()
category = dbutils.widgets.get("category").strip()
today = datetime.strptime(dbutils.widgets.get('today').strip(), '%Y-%m-%d')
wave_date = today.replace(day=1)
forced_load = dbutils.widgets.get('forced_load').strip()
reprocessing = dbutils.widgets.get('reprocessing').lower().strip()
print(f"Categoria: {category}")
print(f"Fecha proceso: {today.strftime('%Y-%m-%d')}")
print(f"Pais: {country}")
print(f"Forzar carga: {forced_load}")
print(f'Reproceso: {reprocessing}')
print(f"Fecha ola: {wave_date.strftime('%Y-%m-%d')}")
class DataExtractor():
    def __init__(self, category, today, country, list_countries, wave_date, reprocessing, forced_load):
        self.category = category
        self.today = today
        self.country = country
        self.list_countries = list_countries
        self.wave_date = wave_date
        self.reprocessing = reprocessing
        self.forced_load = forced_load
        if self.reprocessing == 'no' or self.forced_load == 'si':
            self.days_for_wave = self.get_days_for_wave()
        if self.country.lower() == 'todos':
            self.country_condition = "lower(pais) = lower(pais)"
        else:
            self.country_condition = f"lower(pais) = lower('{country}')"
        if self.category.lower() == 'todos':
            self.category_condition = "lower(categoria) = lower(categoria)"
        else:
            self.category_condition = f"lower(categoria) = lower('{category}')"

    def get_days_for_wave_by_country(self, country, path_file):
        days_for_wave = (
            spark.read.format("com.crealytics.spark.excel")
            .option("header", "true")
            .option("treatEmptyValuesAsNulls", "true")
            .option("inferSchema", "true")
            .load(path_file)
            .where(f"fecha_ola = '{self.wave_date}'")
            .where(f"lower(pais) = lower('{country}')")
            .selectExpr(
                "fecha_ola",
                "to_date(fecha) as fecha_transaccion",
                "pais")
        )
        if days_for_wave.count() == 0:
            # Desired year and month
            year = self.wave_date.year
            month = self.wave_date.month
            # Get the number of days in the specified month
            _, num_days = calendar.monthrange(year, month)
            # Build a list with every day of the month
            days = [(date(year, month, day),) for day in range(1, num_days+1)]
            # Convert each date to a string
            days_str = [(day[0].strftime("%Y-%m-%d"),) for day in days]
            # Convert list to dataframe
            days_for_wave = (
                spark.createDataFrame(days_str)
                .withColumnRenamed("_1", "fecha_transaccion")
                .withColumn("fecha_ola", F.lit(self.wave_date))
                .withColumn("pais", F.lit(country))
                .selectExpr(
                    "fecha_ola",
                    "to_date(fecha_transaccion) AS fecha_transaccion",
                    "pais")
            )
        print(f"Loaded {days_for_wave.count()} days for wave {self.wave_date.strftime('%Y-%m-%d')} and country {country}")
        return days_for_wave

    def get_days_for_wave(self):
        """
        Get the days for the wave
        """
        # Load dim_dia_ola.xlsx with wave definition
        path_file = "dbfs:/mnt/storetrack/transitraw/dichterneira/storelive/dim_dia_ola.xlsx"
        print(f'Loading days for wave from file: {path_file}...')
        if self.country.lower() == 'todos':
            # Get list of countries (excluding 'Todos')
            list_of_countries = self.list_countries[1:]
        else:
            list_of_countries = [self.country]
        schema = StructType([
            StructField("fecha_ola", TimestampType(), nullable=True),
            StructField("fecha_transaccion", DateType(), nullable=True),
            StructField("pais", StringType(), nullable=True)
        ])
        # Create an empty DataFrame with the specified schema
        days_for_wave = spark.createDataFrame([], schema=schema)
        for country in list_of_countries:
            days_for_wave_by_country = self.get_days_for_wave_by_country(country, path_file)
            max_day_of_wave = days_for_wave_by_country.agg(F.max("fecha_transaccion")).collect()[0][0]
            if self.today.date() > max_day_of_wave and self.forced_load == 'no':
                print(f"Today {self.today.strftime('%Y-%m-%d')} is not included in wave days for country {country} and wave {self.wave_date.strftime('%Y-%m-%d')}")
            else:
                if country == list_of_countries[0]:
                    days_for_wave = days_for_wave_by_country
                else:
                    days_for_wave = days_for_wave.union(days_for_wave_by_country)
        return days_for_wave

    def get_data_items(self):
        """
        Filter sales by category, wave and country
        """
        if self.reprocessing == 'si' and self.forced_load == 'no':
            sales_filtered = (
                table.get_silver_table(table_name='sl_fact_item_ticket')
                .where(f"fecha_ola = '{self.wave_date}'")
                .where(self.country_condition)
                .where(self.category_condition)
            )
        else:
            sales_filtered = (
                table.get_silver_table(table_name='stpos_dim_itemticket')
                .drop("fecha_ola")
                .where(self.country_condition)
                .where(self.category_condition)
                .selectExpr("*", "to_date(date) as fecha_transaccion")
                .join(self.days_for_wave, ["fecha_transaccion", "pais"], how="inner")
                .drop("fecha_transaccion")
            )
        print(f"{sales_filtered.count()} items loaded. [Get data items]")
        return sales_filtered

    def get_product_catalog(self):
        product_catalog = (
            table.get_bronze_table(table_name='brz_catalogo_productos', module_name='catalogo')
            .where(self.country_condition)
            .selectExpr(
                "upc as barcode",
                "pais",
                "categoria",
                "marca",
                "submarca",
                "fabricante",
                """CASE WHEN lower(split(contenido, ' ')[1]) = 'ml' THEN 'L'
                        WHEN lower(split(contenido, ' ')[1]) = 'gr' THEN 'Kg'
                        WHEN lower(split(contenido, ' ')[1]) = 'und' THEN 'Und'
                   END AS unidad_std""",
                "conversion AS contenido_std",
                "split(contenido, ' ')[0] AS contenido",
                "split(contenido, ' ')[1] AS unidad_medida",
                "idref AS id_ref"
            )
        )
        return product_catalog
class DataEnricher():
    def __init__(self, reprocessing, forced_load):
        self.reprocessing = reprocessing
        self.forced_load = forced_load

    def rename_fields(self, df_item):
        if self.reprocessing == 'no' or self.forced_load == 'si':
            print("Renaming fields...")
            df_item = (
                df_item
                .selectExpr(
                    'CAST(fecha_ola AS DATE) AS fecha_ola',
                    'pdv AS nombre_pdv',
                    'marca',
                    'submarca',
                    'pais',
                    'contenido',
                    'unidad_medida',
                    'CAST(cantidad AS DOUBLE) as cantidad',
                    'CAST(precio_local AS DOUBLE) as precio_local',
                    'barcode',
                    'date AS fecha_transaccion',
                    'categoria',
                    'categoria_name',
                    'descripcion',
                    'id_ref',
                    'posdb_id',
                    'id_ticket',
                    'id_item',
                    'id_pdv',
                    'venta_usd',
                    'venta_local',
                    'precio_usd',
                    'id_canasto'
                )
            )
        return df_item

    def calculate_standard_fields(self, df_item):
        if self.reprocessing == 'no' or self.forced_load == 'si':
            print("Calculating standard fields...")
            df_item = (
                df_item
                # Add column with converted Ml to L and Gr to Kg
                .withColumn("contenido_std",
                            F.when(F.col("unidad_medida") == "Ml", F.col("contenido") / 1000)
                            .when(F.col("unidad_medida") == "Gr", F.col("contenido") / 1000)
                            .otherwise(F.col("contenido")))
                .withColumn("unidad_std",
                            F.when(F.col("unidad_medida") == "Ml", F.lit("L"))
                            .when(F.col("unidad_medida") == "Gr", F.lit("Kg")))
            )
        return df_item

    def calculate_fields(self, df_items):
        """
        Set the time zone of the dataframe
        """
        if self.reprocessing == 'no' or self.forced_load == 'si':
            print("Calculating time zone field...")
            # Create dataframe with the time zone
            time_zone = [(1, '05:00:00', '09:59:59'),
                         (2, '10:00:00', '13:59:59'),
                         (3, '14:00:00', '19:59:59'),
                         (4, '20:00:00', '23:59:59'),
                         (4, '00:00:00', '04:59:59')]
            time_zone = spark.createDataFrame(time_zone, ['id_franja', 'inicio', 'fin'])
            # Convert inicio and fin to datetime
            time_zone = (
                time_zone
                .withColumn("inicio", F.to_timestamp(F.col("inicio"), "HH:mm:ss"))
                .withColumn("fin", F.to_timestamp(F.col("fin"), "HH:mm:ss"))
            )
            df_items = (
                df_items
                .withColumn("hora_transaccion", F.substring(F.col("fecha_transaccion"), 12, 8))
                .withColumn("hora_transaccion", F.to_timestamp(F.col("hora_transaccion"), "HH:mm:ss"))
                .join(time_zone, on=F.col("hora_transaccion").between(F.col("inicio"), F.col("fin")), how="left")
                .drop("hora_transaccion", "inicio", "fin")
            )
        return df_items

    def update_product_features(self, data, product_catalog):
        if data.count() > 0:
            print("Updating fields from brz_catalogo_productos")
            data = (
                data
                .drop("categoria", "marca", "submarca", "fabricante", "unidad_std", "contenido_std", "contenido", "unidad_medida", "id_ref")
                .join(product_catalog, on=["barcode", "pais"], how="left")
            )
        return data
class DataWriter():
    def __init__(self, wave_date, country, category):
        self.wave_date = wave_date
        self.country = country
        self.category = category
        if self.country.lower() == 'todos':
            self.country_condition = "lower(pais) = lower(pais)"
        else:
            self.country_condition = f"lower(pais) = lower('{country}')"
        if self.category.lower() == 'todos':
            self.category_condition = "lower(categoria) = lower(categoria)"
        else:
            self.category_condition = f"lower(categoria) = lower('{category}')"

    def delete_processed_data(self, datos):
        df_categoria_activa = (
            table.get_bronze_table(
                table_name='sl_configuracion_procesamiento_zona_silver',
                module_name='storetrack'
            )
            .where(f"fecha_ola = '{wave_date}' and lower(trim(procesar)) = 'si'")
            .where(self.country_condition)
            .where(self.category_condition)
            .selectExpr(
                "categoria",
                "pais",
                "fecha_ola"
            )
        )
        if datos.count() > 0:
            display(datos.where("categoria = 'Galletas dulces'"))
            table_path = table.get_silver_table_path(table_name="sl_fact_item_ticket")
            deltaTableToWrite = DeltaTable.forPath(spark, table_path)
            print("Deleting old rows...")
            deltaTableToWrite.alias('current')\
                .merge(
                    df_categoria_activa.alias('delete'),
                    'current.pais = delete.pais AND current.categoria = delete.categoria AND current.fecha_ola = delete.fecha_ola')\
                .whenMatchedDelete()\
                .execute()
            display(datos.where("categoria = 'Galletas dulces'"))

    def register_processed_data(self, data):
        if data.count() > 0:
            print("Inserting new rows...")
            display(data.where("categoria = 'Galletas dulces'"))
            table_path = table.get_silver_table_path(table_name="sl_fact_item_ticket")
            deltaTableToWrite = DeltaTable.forPath(spark, table_path)
            deltaTableToWrite.alias('current')\
                .merge(
                    data.alias('new'),
                    'current.id_item = new.id_item AND current.fecha_ola = new.fecha_ola')\
                .whenNotMatchedInsert(values=
                    {
                        "fecha_ola": "new.fecha_ola",
                        "marca": "new.marca",
                        "submarca": "new.submarca",
                        "pais": "new.pais",
                        "contenido": "new.contenido",
                        "unidad_medida": "new.unidad_medida",
                        "cantidad": "new.cantidad",
                        "precio_local": "new.precio_local",
                        "barcode": "new.barcode",
                        "fecha_transaccion": "new.fecha_transaccion",
                        "categoria": "new.categoria",
                        "categoria_name": "new.categoria_name",
                        "descripcion": "new.descripcion",
                        "id_ref": "new.id_ref",
                        "posdb_id": "new.posdb_id",
                        "id_ticket": "new.id_ticket",
                        "id_item": "new.id_item",
                        "id_pdv": "new.id_pdv",
                        "venta_usd": "new.venta_usd",
                        "venta_local": "new.venta_local",
                        "precio_usd": "new.precio_usd",
                        "nombre_pdv": "new.nombre_pdv",
                        "contenido_std": "new.contenido_std",
                        "unidad_std": "new.unidad_std",
                        "id_canasto": "new.id_canasto",
                        "id_franja": "new.id_franja"
                    }
                )\
                .execute()
            display(data.where("categoria = 'Galletas dulces'"))
            print(f"{data.count()} items loaded. [Write processed data]")
        else:
            print("No data to save in silver.sl_fact_item_ticket")
if __name__ == '__main__':
    data_extractor = DataExtractor(category, today, country, list_countries, wave_date, reprocessing, forced_load)
    data = data_extractor.get_data_items()
    product_catalog = data_extractor.get_product_catalog()
    cleaner = DataCleaner(wave_date, country, category, reprocessing, forced_load)
    data = cleaner.clean_data(data)
    data_enricher = DataEnricher(reprocessing, forced_load)
    data = data_enricher.rename_fields(data)
    data = data_enricher.calculate_standard_fields(data)
    data = data_enricher.calculate_fields(data)
    data = data_enricher.update_product_features(data, product_catalog)
    data_write = DataWriter(wave_date, country, category)
    data_write.delete_processed_data(data)
    data_write.register_processed_data(data)
The parameters with which I am running the notebook are:
Categoria: Todos
Fecha proceso: 2022-12-01
Pais: Todos
Forzar carga: no
Reproceso: si
Fecha ola: 2022-12-01
The following output is displayed:
993313 items loaded. [Get data items]
62023 items loaded. [Remove blocked categories]
Updating fields from brz_catalogo_productos
[DISPLAY ROWS OF DATAFRAME data]
Deleting old rows...
Query returned no results
No data to save in silver.sl_fact_item_ticket
Any insights on why the dataframe is getting cleared would be greatly appreciated.
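One thing that may be worth ruling out (an educated guess, not something confirmed in the question): Spark DataFrames are evaluated lazily, and with Reproceso: si the data DataFrame is built on top of sl_fact_item_ticket, the very table delete_processed_data() deletes from, so any action taken after the delete re-reads the already-purged table. A minimal sketch of how the tail of the main block could pin the rows first; cache() and count() are standard PySpark calls, but placing them here is my assumption:
    data = data_enricher.update_product_features(data, product_catalog)
    # pin the current rows in memory before their source table is modified
    data = data.cache()
    print(f"Rows materialized before delete: {data.count()}")  # action that fills the cache
    data_write = DataWriter(wave_date, country, category)
    data_write.delete_processed_data(data)
    data_write.register_processed_data(data)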

Python Code only running in Debugging mode

So I've been working on a script that a co-worker of mine made. I fixed some of his issues, but I cannot figure out why it only works when I run it in debugging mode in VS Code; when I run it from a normal Python shell it does not produce the output files that it does in debug mode. Does anyone know why? (Some links and sensitive company data have been removed.)
Here is the code:
import requests
from requests.auth import HTTPBasicAuth
import json
import csv
import os
import pandas as pd
import datetime
import urllib3
from datetime import datetime, timedelta
#______________________________________________________________________________________
# main function
def Main():
    # HTTP request with the API account to the Rapid7 export list
    urllib3.disable_warnings()  # ignores the warnings about the self-signed certificates
    url = "URL REMOVED"
    r = requests.get(url, verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED))
    # save the data from the HTTP request in CSV format
    with open('downloaded.csv', 'wb') as csv_file:
        csv_file.write(r.content)
    # open the input file from AD
    Filenameslist = "C:\Robert-Code\ComputerListForRapid7.json"  # full path to the file added
    with open(Filenameslist) as f:
        data = json.load(f)
    # convert the JSON to a CSV file
    with open("computerlist.csv", "w") as f:
        fieldnames = data[3].keys()
        # take the keys from the 3rd row, because sometimes the first row of the source file is empty
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    # I use the pandas module to keep only the "name" column of the Rapid7 file that I need.
    # Then I swap name and address so that the name is in the first column. For this I read the CSV into a dataframe.
    dfR7 = pd.read_csv("downloaded.csv")
    titles = list(dfR7.columns)
    titles[0], titles[1] = titles[1], titles[0]  # swap so that Name is in the first column
    dfR7 = dfR7[titles]  # put the columns and data in the right order in the object
    dfR7.sort_values(["Name"], inplace=True)
    dfR7.drop(columns=["Address", "Operating System", "Site", "Exploits", "Malware", "Vulnerabilities", "Risk", "Last Scan", "Assessed"], inplace=True)
    dfR7["Name"] = dfR7["Name"].str.split('.').str[0]  # strip the domain from the FQDN
    dfR7["Name"] = dfR7["Name"].str.lower()  # everything lowercase
    # pandas is also used to keep one "name" column from the AD file in the dfAD object so that I can compare later
    dfAD = pd.read_csv("computerlist.csv")
    dfAD.drop(columns=["DNSHostName", "OperatingSystem", "IPAddress", "LastLogon"], inplace=True)
    dfAD["Computer"] = dfAD["Computer"].str.lower()
    # save both objects to a CSV file in order to compare them
    dfR7.to_csv("fr7.csv", index=False)
    dfAD.to_csv("fAD.csv", index=False)
    with open('fr7.csv', 'r') as t1, open('fAD.csv', 'r') as t2:
        fileRapid = t1.readlines()
        fileAD = t2.readlines()
    # compare the files fr7.csv and fad.csv with a for loop
    # and save the result in update.csv
    with open('update.csv', 'w') as outFile:
        for line in fileAD:
            if line not in fileRapid:
                outFile.write(line)
    # here I bring in the old AD file again to merge it with the freshly created update.csv file
    # so that I have all the useful columns again
    dfAD = pd.read_csv("computerlist.csv")
    dfAD["Computer"] = dfAD["Computer"].str.lower()
    dfAD.to_csv("f1AD.csv", index=False)
    # merge function of the pandas module
    data1 = pd.read_csv('update.csv')
    data2 = pd.read_csv("f1AD.csv")
    output1 = pd.merge(data1, data2,
                       on='Computer',
                       how='inner')
    # save to TotalresultsAD.csv
    output1.to_csv("totaldifferenceAD_R7.csv", index=False)
    # with the datetime module I create a variable time: today minus 60 days
    time = datetime.today() - timedelta(60)
    # read the file in twice
    dfgood = pd.read_csv("totaldifferenceAD_R7.csv")
    dfbad = pd.read_csv("totaldifferenceAD_R7.csv")
    # this output file shows the assets whose LastLogon is more recent than 60 days ago
    dfgood['LastLogon'] = pd.to_datetime(dfgood['LastLogon'], errors='coerce')  # errors='coerce' makes invalid values in the LastLogon column be ignored
    dfgood.sort_values(["LastLogon"], inplace=True)
    dfnew = (dfgood['LastLogon'] >= time)
    dfnew = dfgood.loc[dfnew]
    # this output file shows the assets whose LastLogon is older than 60 days ago
    dfbad['LastLogon'] = pd.to_datetime(dfbad['LastLogon'], errors='coerce')  # errors='coerce' makes invalid values in the LastLogon column be ignored
    dfbad.sort_values(["LastLogon"], inplace=True)
    newdf2 = (dfbad['LastLogon'] < time)
    newdf2 = dfbad.loc[newdf2]
    # write out the final files
    dfnew.to_csv("newer_than_60_days.csv", index=False)
    newdf2.to_csv("older_than_60_days.csv", index=False)
    # clean up the intermediate files
    os.remove("FAD.csv")
    os.remove("fr7.csv")
    os.remove("computerlist.csv")
    os.remove("downloaded.csv")
    os.remove("f1AD.csv")
    os.remove("update.csv")
if __name__ == "__main__":
    Main()
Thanks in advance for any help
Because I don't have a high enough SO reputation, unfortunately I can't simply comment this and need to make it an 'Answer'.
Changing
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED))
to
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED'))
will get the syntax highlighting all fixed up and may make it easier for someone smarter than me to assist you :)
Something that I've previously come across (primarily with web scraping packages) is functions that don't play nicely with relative paths; perhaps changing them to absolute paths using os.path.abspath(".....") may help? It's a stab in the dark so that this 'Answer' actually has a potentially useful element to it, but it may be an adjustment worth exploring.
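For instance, a minimal sketch of what that could look like (the BASE_DIR and in_base names are mine, purely illustrative), anchoring every file next to the script instead of the current working directory:
import os

BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # folder containing this script

def in_base(name):
    # build an absolute path for a file that should live next to the script
    return os.path.join(BASE_DIR, name)

# e.g. instead of open('downloaded.csv', 'wb'):
# with open(in_base('downloaded.csv'), 'wb') as csv_file:
#     csv_file.write(r.content)
That way the output files end up in the same place no matter where the interpreter (or the debugger) was launched from.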

Python scraping regex (word just next to the number)

I hope you're well. I'd like to scrape different data with regex :)
# Retrieve the ingredients
try:
    ingredients = [item.text.replace("\n", "").strip() for item in soup.find_all("li", {"class": "recipe-ingredients__list__item"})]
except Exception as e:
    ingredients = None
Here is the JSON result:
"ingredients": [
"250g de porc h\u00e2ch\u00e9 (le filet mignon c'est vraiment bon)",
"1 oignon blanc",
"1 carotte",
"6 champignons parfum\u00e9s chinois (pas des champignons noirs)",
"1poign\u00e9e de vermicelles de riz (cheveux d'ange)",
"1poign\u00e9e de germes de soja",
"3 oeufs",
"2gousses d'ail",
"Galette de riz vietnamiennes (les grandes)",
"4cuill\u00e8res \u00e0 soupe de nuoc mam",
"Poivre"
Do you know how I can scrape these separately:
the quantity (here, the number)
the quantifier (which, when present, always sticks to the number)
the name of the ingredient
I can't figure out how to do it with regex.
Thanks for your response @Ryszard Czech :) it's my first time using regex. If I want to save the separated data directly as JSON, should the code be something like this?
# Retrieve the ingredients
try:
    ingredients = [item.text.replace("\n", "").strip() for item in soup.find_all("li", {"class": "recipe-ingredients__list__item"}, [re.compile(r'^(?:(\d+)([^\W\d_]*))?(.*)', x), for x in ingredients])]
except Exception as e:
    ingredients = None
Or do I need to apply some pattern to ingredients afterwards?
Use
import json, re
j="""{"ingredients": [
"250g de porc h\u00e2ch\u00e9 (le filet mignon c'est vraiment bon)",
"1 oignon blanc",
"1 carotte",
"6 champignons parfum\u00e9s chinois (pas des champignons noirs)",
"1poign\u00e9e de vermicelles de riz (cheveux d'ange)",
"1poign\u00e9e de germes de soja",
"3 oeufs",
"2gousses d'ail",
"Galette de riz vietnamiennes (les grandes)",
"4cuill\u00e8res \u00e0 soupe de nuoc mam",
"Poivre"]}"""
jsObj = json.loads(j)
print( [re.findall(r'^(?:(\d+)([^\W\d_]*))?(.*)', x) for x in jsObj["ingredients"]] )
Output:
[[('250', 'g', " de porc hâché (le filet mignon c'est vraiment bon)")], [('1', '', ' oignon blanc')], [('1', '', ' carotte')], [('6', '', ' champignons parfumés chinois (pas des champignons noirs)')], [('1', 'poignée', " de vermicelles de riz (cheveux d'ange)")], [('1', 'poignée', ' de germes de soja')], [('3', '', ' oeufs')], [('2', 'gousses', " d'ail")], [('', '', 'Galette de riz vietnamiennes (les grandes)')], [('4', 'cuillères', ' à soupe de nuoc mam')], [('', '', 'Poivre')]]
The ^(?:(\d+)([^\W\d_]*))?(.*) expression optionally matches one or more digits (capture 1) followed by optional letters (capture 2), and then captures the rest of the string into capture 3.
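If the goal from the follow-up is to store the separated pieces directly as JSON, here is a minimal sketch; the quantity/unit/name field names are my own labels, and jsObj is the object built above:
import json
import re

pattern = re.compile(r'^(?:(\d+)([^\W\d_]*))?(.*)')

def split_ingredient(text):
    quantity, unit, name = pattern.match(text).groups()
    return {"quantity": quantity or None,  # None when there is no leading number
            "unit": unit or None,          # None when no unit is glued to the number
            "name": name.strip()}

structured = [split_ingredient(x) for x in jsObj["ingredients"]]
print(json.dumps(structured, ensure_ascii=False, indent=2))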

Iteration failure when using BeautifulSoup

I'm using BeautifulSoup to try to extract data from a web page, but for some reason it fails to iterate over items found in seasons greater than 1. There is seemingly no reason for this behavior, as the nodes look exactly the same to me.
def scrape_show(show):
    source = requests.get(show.url).text
    soup = BeautifulSoup(source, 'lxml')
    # All seasons and episodes
    area = soup.find('div', class_='play_video-area-aside play_video-area-aside--related-videos play_video-area-aside--related-videos--titlepage')
    for article in area:
        if "season" in article.get('id'):
            season = article.h2.a.find('span', class_='play_accordion__section-title-inner').text
            print(season + " -- " + article.get('id'))
            # All content for the given season
            ul = article.find('ul')
            if ul is None:
                print("null!")  # This should not happen
Example Output:
Season 1 -- section-season1-xxxx
Season 2 -- section-season2-xxxx
null!
https://www.svtplay.se/andra-aket (url from example)
The data is not available in HTML form for all seasons, only for season 1. But the information is embedded in the page in JSON form. You can parse this data with the re and json modules:
import re
import json
import requests

url = 'https://www.svtplay.se/andra-aket?tab=season-1-18927182'
data = json.loads( re.findall(r"root\['__svtplay_apollo'\] = (\{.*?\});", requests.get(url).text)[0] )

from pprint import pprint
# pprint(data) # <-- uncomment this to see all the data

for k in data:
    if k.startswith('Episode:') or (k.startswith('$Episode:') and k.endswith('urls')):
        print(k)
        pprint(data[k])
        print('-' * 80)
Prints (data about episodes 1 and 2 and their URLs):
Episode:1383301-001
{'__typename': 'Episode',
'accessibilities': {'json': ['AudioDescribed', 'SignInterpreted'],
'type': 'json'},
'duration': 1700,
'id': '1383301-001',
'image': {'generated': False,
'id': 'Image:18926434',
'type': 'id',
'typename': 'Image'},
'live': None,
'longDescription': 'Madde och Petter flyttar tillsammans med sin 13-åriga '
'dotter Ida till Björkfjället, en liten skidort i svenska '
'fjällen. Madde är uppvuxen där men för '
'Stockholms-hipstern Petter är det ett chockartat '
'miljöombyte. Maddes mamma Ingegerd har gått i pension och '
'lämnat över ansvaret för familjens lilla hotell till '
'Madde. Hon och Petter ska nu driva "Gammelgården" med '
'Maddes bror Tommy, vilket visar sig vara en inte helt '
'lätt uppgift. I rollerna: Sanna Sundqvist, Jakob '
'Setterberg, William Spetz, Bert-Åke Varg, Mattias '
'Fransson och Lena T Hansson. Del 1 av 8.',
'name': 'Avsnitt 1',
'nameRaw': '',
'positionInSeason': 'Säsong 1 — Avsnitt 1',
'restrictions': {'generated': True,
'id': '$Episode:1383301-001.restrictions',
'type': 'id',
'typename': 'Restrictions'},
'slug': 'avsnitt-1',
'svtId': 'jBD1gw8',
'urls': {'generated': True,
'id': '$Episode:1383301-001.urls',
'type': 'id',
'typename': 'Urls'},
'validFrom': '2019-07-25T02:00:00+02:00',
'validFromFormatted': 'Tor 25 jul 02:00',
'validTo': '2020-01-21T23:59:00+01:00',
'variants': [{'generated': False,
'id': 'Variant:1383301-001A',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001S',
'type': 'id',
'typename': 'Variant'},
{'generated': False,
'id': 'Variant:1383301-001T',
'type': 'id',
'typename': 'Variant'}],
'videoSvtId': '8PbQdAj'}
--------------------------------------------------------------------------------
$Episode:1383301-001.urls
{'__typename': 'Urls',
'svtplay': '/video/19970142/andra-aket/andra-aket-sasong-1-avsnitt-1'}
--------------------------------------------------------------------------------
... and so on.

cleaning multi terms from stopwords

I have a list of expressions, and I need to remove the stopwords from these expressions.
ex = ["andare con i piedi di piombo", "avere gli occhi foderati di prosciutto", 'non chiudere occhio', 'con le mani nel sacco']
stopwords = ["ad","al", "allo", "ai","agli", "all", "alla", "col", "in", "il", "della", "un", "con", "non", "i", "di", "le", "nei", "gli"]
I tried this
for es in ex:
    new_ex = ''
    for word in stopwords:
        new_es = es.replace(" " + word + " ", "")
    print(new_es)
The above code does not remove the stopwords
Can someone help?
using your example
ex = ["andare con i piedi di piombo", "avere gli occhi foderati di prosciutto", 'non chiudere occhio', 'con le mani nel sacco']
stopwords = ["ad","al", "allo", "ai","agli", "all", "alla", "col", "in", "il", "della", "un", "con", "non", "i", "di", "le", "nei", "gli"]
you could go with:
for es in ex:
    es = es.split()
    new_es = ''
    for word in es:
        if word not in stopwords:
            new_es += word + ' '
    print(new_es)
This will do the job:
sentences = [
    'andare con i piedi di piombo',
    'avere gli occhi foderati di prosciutto',
    'non chiudere occhio',
    'con le mani nel sacco'
]
words = [
    'ad',
    'al',
    'allo',
    'ai',
    'agli',
    'all',
    'alla',
    'col',
    'in',
    'il',
    'della',
    'un',
    'con',
    'non',
    'i',
    'di',
    'le',
    'nei',
    'gli'
]
for sentence in sentences:
    s = sentence
    for word in words:
        s = s.replace(f' {word} ', '')
    print(s)
The problem in your code is that replace() is always called on the original es, so each iteration throws away the previous result and only the last stopword ends up removed; assign the result back to a working copy (new_es = es before the inner loop, then new_es = new_es.replace(...)), as the code above does with s.
Here is the output of the code above:
andarei piedipiombo
avereocchi foderatiprosciutto
non chiudere occhio
conmani nel sacco
Also note that 'non chiudere occhio' remains the same because the replacement only matches words padded by spaces on both sides, so stopwords at the start or end of a sentence are kept.
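If you want to keep the replacement approach but avoid both the glued-together words and the skipped edge words, a word-boundary regex is one option; a sketch reusing the sentences and words lists from above:
import re

# one pattern matching any stopword as a whole word: \b(?:ad|al|allo|...)\b
stopword_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, words)) + r')\b')

for sentence in sentences:
    cleaned = stopword_pattern.sub('', sentence)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()  # collapse the double spaces left behind
    print(cleaned)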
