cleaning multi-word terms from stopwords - python-3.x

I have a list of expressions, and I need to remove the stopwords from these expressions.
ex = ["andare con i piedi di piombo", "avere gli occhi foderati di prosciutto", 'non chiudere occhio', 'con le mani nel sacco']
stopwords = ["ad","al", "allo", "ai","agli", "all", "alla", "col", "in", "il", "della", "un", "con", "non", "i", "di", "le", "nei", "gli"]
I tried this:
for es in ex:
    new_ex = ''
    for word in stopwords:
        new_es = es.replace(" " + word + " ", "")
    print(new_es)
The above code does not remove the stopwords
Can someone help?

Using your example:
ex = ["andare con i piedi di piombo", "avere gli occhi foderati di prosciutto", 'non chiudere occhio', 'con le mani nel sacco']
stopwords = ["ad","al", "allo", "ai","agli", "all", "alla", "col", "in", "il", "della", "un", "con", "non", "i", "di", "le", "nei", "gli"]
you could go with:
for es in ex:
    es = es.split()
    new_es = ''
    for word in es:
        if word not in stopwords:
            new_es += word + ' '
    print(new_es)

This will do the job:
sentences = [
'andare con i piedi di piombo',
'avere gli occhi foderati di prosciutto',
'non chiudere occhio',
'con le mani nel sacco'
]
words = [
'ad',
'al',
'allo',
'ai',
'agli',
'all',
'alla',
'col',
'in',
'il',
'della',
'un',
'con',
'non',
'i',
'di',
'le',
'nei',
'gli'
]
for sentence in sentences:
    s = sentence
    for word in words:
        s = s.replace(f' {word} ', '')
    print(s)
The problem in your code is that you call replace() on the original es every time and overwrite new_es, so only the last stopword is ever removed; you need to set new_es = es once and then call replace() on new_es itself (as done with s above).
Here is the output of the code above:
andarei piedipiombo
avereocchi foderatiprosciutto
non chiudere occhio
conmani nel sacco
Also note that 'non chiudere occhio' remains the same because you are looking for words padded by spaces on both sides, and 'non' sits at the start of the string.
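If you want to keep a replace-style approach but also handle stopwords at the start or end of an expression, a regex with word boundaries is one option (a sketch using the standard re module, not part of the original answer):

import re

# whole-word alternation built from the stopwords list above
stop_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, stopwords)) + r')\b')

for es in ex:
    # drop the stopwords, then collapse the double spaces left behind
    print(re.sub(r'\s+', ' ', stop_re.sub('', es)).strip())

With this, 'non chiudere occhio' becomes 'chiudere occhio'.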

Related

Union in loop Pyspark

I have two dataframes
data1 = [{'text': 'We traveled a long way to several beautiful houses to see the cats.', 'lang': 'eng'},
{'text': 'قطعنا شوطا طويلا إلى عدة منازل جميلة لرؤية القطط.', 'lang': 'arb'},
{'text': 'Wir reisten einen langen Weg zu mehreren schönen Häusern, um die Katzen zu sehen.', 'lang': 'deu'},
{'text': 'Nous avons parcouru un long chemin vers plusieurs belles maisons pour voir les chats.', 'lang': 'fra'}]
sdf1 = spark.createDataFrame(data1)
data2 = [{'text': 'Przebyliśmy długą drogę do kilku pięknych domów, aby zobaczyć koty.', 'lang': 'pol'},
{'text': 'Mēs ceļojām garu ceļu uz vairākām skaistām mājām, lai redzētu kaķus.', 'lang': 'lav'},
{'text': 'Kedileri görmek için birkaç güzel eve uzun bir yol kat ettik.', 'lang': 'tur'}]
sdf2 = spark.createDataFrame(data2)
I want to add only specific language rows from sdf2 to the first dataframe. I do it with a loop:
langs = ['pol', 'tur']
for lang in langs:
    sdf_l = sdf2.where(F.col('lang') == lang)
    sdf_final = sdf1.union(sdf_l)
But it only appends rows from the last language in langs.
There is no need for a loop here: the loop overwrites sdf_final on every iteration, so only the last language's rows survive. Filter sdf2 first, and then union it with sdf1.
import pyspark.sql.functions as F
...
langs = ['pol', 'tur']
sdf_final = sdf1.union(sdf2.filter(F.col('lang').isin(langs)))
If you do want to use a loop, you can define a temporary variable and accumulate the union into sdf1:
for lang in langs:
    sdf_1 = sdf2.where(F.col('lang') == lang)
    sdf1 = sdf1.union(sdf_1)
sdf1.show(truncate=False)
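As an aside, the same accumulate-the-union pattern can be written with functools.reduce, which avoids mutating sdf1 (a sketch assuming the sdf1, sdf2 and langs defined above):

from functools import reduce
import pyspark.sql.functions as F

# union the per-language slices of sdf2 onto sdf1, one at a time
sdf_final = reduce(
    lambda acc, lang: acc.union(sdf2.where(F.col('lang') == lang)),
    langs,
    sdf1,
)
sdf_final.show(truncate=False)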

split punctuations without using regex

Given input:
Democr _acy , is overrat _ed .
Desired output:
Democracy, is overrated.
Here is my code:
sentence = input()
punctuation = "!\"#$%&'()*+,-./:;<=>?#[\]^`{|}~"
suffixes = ["acy", "ance", "ence", "dom", "er", "or", "ism", "ist",
            "ty", "ment", "ness", "ship", "sion", "tion", "ate",
            "en", "fy", "ize", "able", "ible", "al",
            "esque", "ful", "ic", "ous", "ish", "ive",
            "less", "ed", "ing", "ly", "ward", "wise"]
sentence_list = sentence.split('_')
c = ""
if c not in punctuation:
    print("".join(sentence_list))
elif c in punctuation:
    for c in sentence:
        print("".join(sentence_list).split(c))
As you can see my output has 29 different lists but I just want one of them.
I want to remove '_' from the words and join the punctuation back onto the words I removed '_' from.
When I write a code like:
sentence_list = sentence.split('_')
print("".join(sentence_list))
the '_' and punctuation disappear. Where am I going wrong?
This is how I would tackle this problem.
def combineSentence(si):
    punctuation = "!\"#$%&'()*+,-./:;<=>?#[\]^`{|}~"
    rslt = ''
    ptr = 0
    while ptr < len(si):
        if si[ptr] not in punctuation and si[ptr] != '_':
            # ordinary character: copy it over
            rslt += si[ptr]
        else:
            # '_' or punctuation: drop the space copied just before it
            rslt = rslt[:-1]
            if si[ptr] in punctuation:
                rslt += si[ptr]
        ptr += 1
    return rslt
executing combineSentence('Democr _acy , is overrat _ed .') will yield:
'Democracy, is overrated.'
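If the input always marks suffixes with ' _' and always puts a space before punctuation, a couple of str.replace calls give the same result without walking character by character (a sketch under that assumption, not the answer's original approach):

def combine_sentence(si):
    punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~"
    si = si.replace(' _', '')          # glue suffixes back on: 'Democr _acy' -> 'Democracy'
    for c in punctuation:
        si = si.replace(' ' + c, c)    # remove the space left before each punctuation mark
    return si

print(combine_sentence('Democr _acy , is overrat _ed .'))  # Democracy, is overrated.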

Python scraping regex (word just next to the number)

I hope you're well. I'd like to scrape different data with regex :)
# Retrieving the ingredients
try:
    ingredients = [item.text.replace("\n", "").strip() for item in soup.find_all("li", {"class": "recipe-ingredients__list__item"})]
except Exception as e:
    ingredients = None
Here is the JSON result:
"ingredients": [
"250g de porc h\u00e2ch\u00e9 (le filet mignon c'est vraiment bon)",
"1 oignon blanc",
"1 carotte",
"6 champignons parfum\u00e9s chinois (pas des champignons noirs)",
"1poign\u00e9e de vermicelles de riz (cheveux d'ange)",
"1poign\u00e9e de germes de soja",
"3 oeufs",
"2gousses d'ail",
"Galette de riz vietnamiennes (les grandes)",
"4cuill\u00e8res \u00e0 soupe de nuoc mam",
"Poivre"
Do you know how I can scrape separately:
the quantity (the number)
the quantifier/unit (which always sticks to the number when it exists)
the name of the ingredient
I can't figure out how to do it with regex.
Thanks for your response @Ryszard Czech :) it's the first time I use regex. If I want to save the separated data directly in JSON,
should the code be something like this?
# Retrieving the ingredients
try:
    ingredients = [item.text.replace("\n", "").strip() for item in soup.find_all("li", {"class": "recipe-ingredients__list__item"}, [re.compile(r'^(?:(\d+)([^\W\d_]*))?(.*)', x), for x in ingredients])]
except Exception as e:
    ingredients = None
Or do I need to define a pattern and apply it to ingredients?
Use
import json, re
j="""{"ingredients": [
"250g de porc h\u00e2ch\u00e9 (le filet mignon c'est vraiment bon)",
"1 oignon blanc",
"1 carotte",
"6 champignons parfum\u00e9s chinois (pas des champignons noirs)",
"1poign\u00e9e de vermicelles de riz (cheveux d'ange)",
"1poign\u00e9e de germes de soja",
"3 oeufs",
"2gousses d'ail",
"Galette de riz vietnamiennes (les grandes)",
"4cuill\u00e8res \u00e0 soupe de nuoc mam",
"Poivre"]}"""
jsObj = json.loads(j)
print( [re.findall(r'^(?:(\d+)([^\W\d_]*))?(.*)', x) for x in jsObj["ingredients"]] )
Output:
[[('250', 'g', " de porc hâché (le filet mignon c'est vraiment bon)")], [('1', '', ' oignon blanc')], [('1', '', ' carotte')], [('6', '', ' champignons parfumés chinois (pas des champignons noirs)')], [('1', 'poignée', " de vermicelles de riz (cheveux d'ange)")], [('1', 'poignée', ' de germes de soja')], [('3', '', ' oeufs')], [('2', 'gousses', " d'ail")], [('', '', 'Galette de riz vietnamiennes (les grandes)')], [('4', 'cuillères', ' à soupe de nuoc mam')], [('', '', 'Poivre')]]
The ^(?:(\d+)([^\W\d_]*))?(.*) expression optionally matches one or more digits (capture 1) followed by optional letters (capture 2), and then captures the rest into capture 3.
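To address the follow-up about saving the separated data directly to JSON, one option is to turn each match into a small dict; this is only a sketch (the quantity/unit/name field names are made up, and jsObj is the object from the snippet above):

import json
import re

pattern = re.compile(r'^(?:(\d+)([^\W\d_]*))?(.*)')

structured = []
for item in jsObj["ingredients"]:
    quantity, unit, name = pattern.match(item).groups()
    structured.append({
        "quantity": quantity,     # None when there is no leading number
        "unit": unit or None,     # e.g. "g", "poignée"; None when absent
        "name": name.strip(),
    })

print(json.dumps(structured, ensure_ascii=False, indent=2))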

Spacy ent.label_ cannot define organization

I am using spaCy to analyze terrorism articles and it is weird that spaCy cannot find organizations such as Fatah. The code is below:
import spacy
from collections import defaultdict, Counter

nlp = spacy.load('en')

def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()

terrorism_articles = read_file_to_list('data/rand-terrorism-dataset.txt')
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]
common_terrorist_groups = [
'taliban',
'al - qaeda',
'hamas',
'fatah',
'plo',
'bilad al - rafidayn'
]
common_locations = [
'iraq',
'baghdad',
'kirkuk',
'mosul',
'afghanistan',
'kabul',
'basra',
'palestine',
'gaza',
'israel',
'istanbul',
'beirut',
'pakistan'
]
location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    article_terrorist_groups = [ent.lemma_ for ent in article.ents if ent.label_ == 'PERSON' or ent.label_ == 'ORG']  # person or organization
    article_locations = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE']
    terrorist_common = [ent for ent in article_terrorist_groups if ent in common_terrorist_groups]
    locations_common = [ent for ent in article_locations if ent in common_locations]
    for found_entity in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1
location_entity_dict
I simply get nothing from the file.
Here is the text data link.
Thank you!
I reproduced your example and it looks like you get empty lists for article_terrorist_groups and terrorist_common, so you won't get the output I assume you require. I changed the model (on my machine) to en_core_web_sm and observed that the entity labels are different from the ones you are checking for in the if statement of your list comprehensions. I am almost certain this is the case whether you use spacy.load('en') or spacy.load('en_core_web_sm').
You are using if ent.label_=='PERSON' or ent.label_ =='ORG', which leads to empty lists. You would need to change this for it to work. Basically, the list comprehensions for article_terrorist_groups and terrorist_common end up producing or iterating over empty lists.
If you look at the output that I posted, you will see that the labels are not 'PERSON' or 'ORG'.
Note: I would recommend adding print statements (or using a debugger) to your code to check it from time to time.
My Code
import spacy
from collections import defaultdict, Counter
nlp = spacy.load('en_core_web_sm') # I changed this

def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()

terrorism_articles = read_file_to_list('rand-terrorism-dataset.txt')
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]
common_terrorist_groups = [
'taliban',
'al - qaeda',
'hamas',
'fatah',
'plo',
'bilad al - rafidayn'
]
common_locations = [
'iraq',
'baghdad',
'kirkuk',
'mosul',
'afghanistan',
'kabul',
'basra',
'palestine',
'gaza',
'israel',
'istanbul',
'beirut',
'pakistan'
]
location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    print([(ent.lemma_, ent.label) for ent in article.ents])
Output
[('CHILE', 383), ('the Santiago Binational Center', 383), ('21,000', 394)]
[('ISRAEL', 384), ('palestinian', 381), ('five', 397), ('Masada', 384)]
[('GUATEMALA', 383), ('U.S. Marines', 381), ('Guatemala City', 384)]
(output truncated in the interest of the length of this answer)
The groups and locations in common_terrorist_groups and common_locations are lowercase, while the extracted entities in terrorist_common and locations_common can contain uppercase letters. So just change if ent in common_terrorist_groups to if ent.lower() in common_terrorist_groups:
common_terrorist_groups = [
'taliban',
'al - qaeda',
'hamas',
'fatah',
'plo',
'bilad al - rafidayn'
]
common_locations = [
'iraq',
'baghdad',
'kirkuk',
'mosul',
'afghanistan',
'kabul',
'basra',
'palestine',
'gaza',
'israel',
'istanbul',
'beirut',
'pakistan'
]
location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    article_terrorist_cands = [ent.lemma_ for ent in article.ents if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
    article_location_cands = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE']
    terrorist_candidates = [ent for ent in article_terrorist_cands if ent.lower() in common_terrorist_groups]
    location_candidates = [loc for loc in article_location_cands if loc.lower() in common_locations]
    for found_entity in terrorist_candidates:
        for found_location in location_candidates:
            location_entity_dict[found_entity][found_location] += 1
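For completeness, you can then inspect the resulting counts, for example like this (a minimal usage sketch):

for group, counter in location_entity_dict.items():
    print(group, counter.most_common(3))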

Spacy Entity Rule doesn't work for cardinal (Social Security number)

I have used EntityRuler to add a new label for social security numbers.
I even set overwrite_ents=True but it still doesn't recognize it.
I verified the regular expression is correct; not sure what else I need to do.
I tried before="ner" but got the same result.
text = "My name is yuyyvb and I leave on 605 W Clinton Street. My social security 690-96-4032"
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "SSN", "pattern": [{"TEXT": {"REGEX": r"\d{3}[^\w]\d{2}[^\w]\d{4}"}}]}])
nlp.add_pipe(ruler)
doc = nlp(text)
for ent in doc.ents:
print("{} {}".format(ent.text, ent.label_))
Actually, the SSN you have is tokenized by spacy into 5 chunks:
print([token.text for token in nlp("690-96-4032")])
# => ['690', '-', '96', '-', '4032']
So, either use a custom tokenizer where a - between digits is not split out as a separate token, or - simpler - create a pattern for the 5 consecutive tokens:
patterns = [{"label": "SSN", "pattern": [{"TEXT": {"REGEX": r"^\d{3}$"}}, {"TEXT": "-"}, {"TEXT": {"REGEX": r"^\d{2}$"}}, {"TEXT": "-"}, {"TEXT": {"REGEX": r"^\d{4}$"}} ]}]
Full spacy demo:
import spacy
from spacy.pipeline import EntityRuler
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp, overwrite_ents=True)
patterns = [{"label": "SSN", "pattern": [{"TEXT": {"REGEX": r"^\d{3}$"}}, {"TEXT": "-"}, {"TEXT": {"REGEX": r"^\d{2}$"}}, {"TEXT": "-"}, {"TEXT": {"REGEX": r"^\d{4}$"}} ]}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
text = "My name is yuyyvb and I leave on 605 W Clinton Street. My social security 690-96-4032"
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])
# => [('605', 'CARDINAL'), ('690-96-4032', 'SSN')]
So, {"TEXT": {"REGEX": r"^\d{3}$"}} matches a token that only consists of three digits, {"TEXT": "-"} is a - char, etc.
Overriding hyphenated numbers tokenization with spacy
If you are interested in how this can be achieved by overriding the default tokenization, pay attention to the infixes: the r"(?<=[0-9])[+\-\*^](?=[0-9-])" regex makes spaCy split hyphen-separated numbers into separate tokens. To make substrings like 1-2-3 and 1-2 get tokenized as single tokens, you would want to remove the - from that regex. You can't simply do that, though; it is trickier: you need to replace it with 2 regexps, r"(?<=[0-9])[+*^](?=[0-9-])" and r"(?<=[0-9])-(?=-)", because the - is also checked between a digit ((?<=[0-9])) and a hyphen (see (?=[0-9-])).
So, the whole thing will look like this:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.pipeline import EntityRuler
from spacy.util import compile_infix_regex
def custom_tokenizer(nlp):
    # Take out the existing rule and replace it with a custom one:
    inf = list(nlp.Defaults.infixes)
    inf.remove(r"(?<=[0-9])[+\-\*^](?=[0-9-])")
    inf = tuple(inf)
    infixes = inf + tuple([r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"])
    infix_re = compile_infix_regex(infixes)
    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                     suffix_search=nlp.tokenizer.suffix_search,
                     infix_finditer=infix_re.finditer,
                     token_match=nlp.tokenizer.token_match,
                     rules=nlp.Defaults.tokenizer_exceptions)
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns([{"label": "SSN", "pattern": [{"TEXT": {"REGEX": r"^\d{3}\W\d{2}\W\d{4}$"}}]}])
nlp.add_pipe(ruler)
text = "My name is yuyyvb and I leave on 605 W Clinton Street. My social security 690-96-4032. Some 9---al"
doc = nlp(text)
print([t.text for t in doc])
# => ['My', 'name', 'is', 'yuyyvb', 'and', 'I', 'leave', 'on', '605', 'W', 'Clinton', 'Street', '.', 'My', 'social', 'security', '690-96-4032', '.', 'Some', '9', '-', '--al']
print([(ent.text, ent.label_) for ent in doc.ents])
# => [('605', 'CARDINAL'), ('690-96-4032', 'SSN'), ('9', 'CARDINAL')]
If you leave out r"(?<=[0-9])-(?=-)", the ['9', '-', '--al'] will turn into '9---al'.
NOTE: you need to use the ^\d{3}\W\d{2}\W\d{4}$ regex: ^ and $ match the start and end of the token (otherwise, tokens that only partially match would also be identified as SSNs), and [^\w] is equivalent to \W.
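The effect of the anchors can be seen with plain re (a standalone illustration using a made-up token):

import re

token = "1690-96-40325"  # merely contains an SSN-shaped substring

print(bool(re.search(r"\d{3}\W\d{2}\W\d{4}", token)))    # True: a substring inside the token matches
print(bool(re.search(r"^\d{3}\W\d{2}\W\d{4}$", token)))  # False: anchors require the whole token to match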
