counting words inside a webpage - python-3.x

I need to count words that are inside a webpage using Python 3. Which module should I use? urllib?
Here is my code:
import urllib.request

def web():
    f = urllib.request.urlopen("https://americancivilwar.com/north/lincoln.html")
    lu = f.read()
    print(lu)

The self-explanatory code below gives you a good starting point for counting words within a web page:
import requests
from bs4 import BeautifulSoup
from collections import Counter
from string import punctuation

# We get the url
r = requests.get("https://en.wikiquote.org/wiki/Khalil_Gibran")
soup = BeautifulSoup(r.content, "html.parser")

# We get the words within paragraphs
text_p = (''.join(s.findAll(text=True)) for s in soup.findAll('p'))
c_p = Counter((x.rstrip(punctuation).lower() for y in text_p for x in y.split()))

# We get the words within divs
text_div = (''.join(s.findAll(text=True)) for s in soup.findAll('div'))
c_div = Counter((x.rstrip(punctuation).lower() for y in text_div for x in y.split()))

# We sum the two counters and get a list of word counts from most to least common
total = c_div + c_p
list_most_common_words = total.most_common()
If you want for example the first 10 most common words you just do:
total.most_common(10)
Which in this case outputs:
In [100]: total.most_common(10)
Out[100]:
[('the', 2097),
('and', 1651),
('of', 998),
('in', 625),
('i', 592),
('a', 529),
('to', 529),
('that', 426),
('is', 369),
('my', 365)]
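Applying the same idea to the asker's original page, here is a minimal sketch using get_text() (an assumption about what's wanted; get_text() flattens every text node in the document, so navigation text gets counted too):
import requests
from bs4 import BeautifulSoup
from collections import Counter
from string import punctuation

r = requests.get("https://americancivilwar.com/north/lincoln.html")
soup = BeautifulSoup(r.content, "html.parser")
# get_text() returns all text in the document as one string
words = (w.strip(punctuation).lower() for w in soup.get_text().split())
counts = Counter(w for w in words if w)
print(counts.most_common(10))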

Related

I want to find the common characters of n strings inside a single multidimensional list using Python

def bibek():
    test_list = [[]]
    x = int(input("Enter the length of String elements using enter -: "))
    for i in range(0, x):
        a = str(input())
        a = list(a)
        test_list.append(a)
    del(test_list[0])
    def filt(b):
        d = ['b', 'i', 'b']
        if b in d:
            return True
        else:
            return False
    for t in test_list:
        x = filter(filt, t)
        for i in x:
            print(i)
bibek()
Suppose test_list=[['b','i','b'],['s','i','b'],['r','i','b']]; the output should be ib, since ib is common among all of them.
An option is to use set and its methods:
test_list = [['b','i','b'],['s','i','b'],['r','i','b']]

common = set(test_list[0])
for item in test_list[1:]:
    common.intersection_update(item)

print(common)  # {'i', 'b'}
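The same result can be had in one expression, since set.intersection accepts multiple arguments (a sketch):
test_list = [['b','i','b'],['s','i','b'],['r','i','b']]
# unpack one set per inner list into a single intersection call
common = set.intersection(*map(set, test_list))
print(common)  # {'i', 'b'}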
UPDATE: now that you have clarified your question, I would do this:
from difflib import SequenceMatcher

test_list = [['b','i','b','b'],['s','i','b','b'],['r','i','b','b']]
# convert the lists to simple strings
strgs = [''.join(item) for item in test_list]

common = strgs[0]
for item in strgs[1:]:
    sm = SequenceMatcher(isjunk=None, a=item, b=common)
    match = sm.find_longest_match(0, len(item), 0, len(common))
    common = common[match.b:match.b + match.size]

print(common)  # 'ibb'
The trick here is to use difflib.SequenceMatcher to get the longest common substring.
One more update after you clarified your question again, this time using collections.Counter:
from collections import Counter

strgs = 'app', 'bapp', 'sardipp', 'ppa'

common = Counter(strgs[0])
print(common)
for item in strgs[1:]:
    c = Counter(item)
    for key, number in common.items():
        common[key] = min(number, c.get(key, 0))

print(common)                     # Counter({'p': 2, 'a': 1})
print(sorted(common.elements()))  # ['a', 'p', 'p']
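As an aside, Counter already implements multiset intersection via the & operator (it keeps the minimum count per element), so the inner loop can be collapsed; a sketch:
from collections import Counter
from functools import reduce

strgs = 'app', 'bapp', 'sardipp', 'ppa'
# & on two Counters keeps min(count_a, count_b) for each key
common = reduce(lambda a, b: a & b, map(Counter, strgs))
print(sorted(common.elements()))  # ['a', 'p', 'p']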

count frequent strings + python3.7

I have a question for you; first of all, here is the code:
from urllib import request
from collections import Counter
from nltk import word_tokenize

URL = 'https://www.gutenberg.org/files/46/46-0.txt'
RESPONSE = request.urlopen(URL)
RAW = RESPONSE.read().decode('utf8')
print('\n')
print(type(RAW))
print('\n')
print(len(RAW))
TOKENS = word_tokenize(RAW)
print(type(TOKENS))
X = len(TOKENS)
print(X)
print(TOKENS[:X])
print('\n')
c = Counter(RAW)
print(c.most_common(30))
Here is the first output I get, which I am satisfied with:
['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'A', 'Christmas', 'Carol', ',', 'by', 'Charles',...]
Here is the second part of the output, which I am not satisfied with:
[(' ', 28438), ('e', 16556), ('t', 11960), ('o', 10940), ('a', 10092), ('n', 8868), ('i', 8791),...]
Here is my question: as you can see, I am counting the most frequently occurring strings in the text, but the problem is that I want to count whole elements of the list of words. The final part of the second output should look something like this:
[('Dickens', 28438), ('Project', 16556), ('Gutenberg', 11960),...]
and not what you see above in the second part of the output. I want to show the 30 most frequently used words in the text, not parts of elements of the list.
Do you know how I can solve that problem? Thanks for helping.
Try changing this line:
c = Counter(TOKENS)
Here is your full code with that change:
from urllib import request
from collections import Counter
from nltk import word_tokenize

URL = 'https://www.gutenberg.org/files/46/46-0.txt'
RESPONSE = request.urlopen(URL)
RAW = RESPONSE.read().decode('utf8')
print('\n')
print(type(RAW))
print('\n')
print(len(RAW))
TOKENS = word_tokenize(RAW)
print(type(TOKENS))
X = len(TOKENS)
print(X)
print(TOKENS[:X])
print('\n')
# Count tokens (words) instead of raw characters
c = Counter(TOKENS)
print(c.most_common(500))
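Note that word_tokenize also emits punctuation tokens (',', '.', and so on), which will rank high in most_common. If you only want words, here is a sketch of filtering them out before counting:
# keep only purely alphabetic tokens
c = Counter(t for t in TOKENS if t.isalpha())
print(c.most_common(30))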

Python - Calculate word frequency in percentage

I have a text for which I have calculated the number of words and the frequency of each word. Now I have to display the top 7 percentage-wise. I have no clue how to do that. I know how to calculate a percentage (part/whole), but I am not sure how to write the code. I have done the sorting by value below.
from operator import itemgetter

def word_frequency():
    """
    Function for word frequency
    """
    d = dict()
    with open(TEXT, "r") as f:
        for line in f:
            words = line.split()
            for w in words:
                if w in d:
                    d[w] += 1
                else:
                    d[w] = 1
    dict_list = sorted(d.items(), key=itemgetter(1), reverse=True)
    print(dict_list[0:7])
This gives me this list:
[('the', 12), ('to', 8), ('of', 6), ('and', 5), ('a', 4), ('in', 4), ('Phil', 3)]
But how do I calculate and present them as percentages instead of the raw counts?
The word count of the text is 199.
Regards
EDIT: NEW REVISED CODE
from operator import itemgetter

def word_frequency():
    """
    Function for word frequency
    """
    d = dict()
    with open(TEXT, "r") as f:
        for line in f:
            words = line.split()
            for w in words:
                if w in d:
                    d[w] += round(1/1.99, 1)
                else:
                    d[w] = round(1/1.99, 1)
    dict_list = sorted(d.items(), key=itemgetter(1), reverse=True)
    print(dict_list[0:7])
Gives me this list:
[('the', 6.0), ('to', 4.0), ('of', 3.0), ('and', 2.5), ('a', 2.0), ('in', 2.0), ('Phil', 1.5)]
I have the percentages (roughly) now, but is there a way to present them in a nicer fashion?
Like:
the 6%
to 4%
of 3%
and 2.5%
a 2%
in 2%
Phil 1.5%
Alternatively, you can use a Counter from collections to count the frequencies of words.
from operator import itemgetter
from collections import Counter

def most_common(instances):
    """Return a list of (instance, count) pairs sorted from most to least common, ties broken alphabetically."""
    return sorted(sorted(Counter(instances).items(), key=itemgetter(0)), key=itemgetter(1), reverse=True)
Utilizing that most_common function, you can do what you said: "calculate percentage, part/whole". You do this by iterating through each word and its frequency and dividing the count by the total number of words.
# words = list of strings
frequencies = most_common(words)
percentages = [(instance, count / len(words)) for instance, count in frequencies]
Depending on your use case, re.findall(r"\w+", text) might not be the best approach to extracting words.
To get the top 7 words, you can slice percentages by doing percentages[:7].
import re

text = "Alice opened the door and found that it led into a small passage, not much larger than a rat-hole: she knelt down and looked along the passage into the loveliest garden you ever saw."
words = re.findall(r"\w+", text)
frequencies = most_common(words)
percentages = [(instance, count / len(words)) for instance, count in frequencies]

for word, percentage in percentages[:7]:
    print("%s %.2f%%" % (word, percentage * 100))
Which outputs:
the 8.57%
a 5.71%
and 5.71%
into 5.71%
passage 5.71%
Alice 2.86%
along 2.86%
If you want the same word in different casings to count as one, you can normalize all the words prior to calling most_common.
import unicodedata

def normalize_caseless(text):
    return unicodedata.normalize("NFKD", text.casefold())
Then:
words = ...
Becomes:
words = list(map(normalize_caseless, ...))
Then a string containing the same word in different casings like this:
text = "Hello Test test TEST test TeSt"
Results in:
test 83.33%
hello 16.67%
Instead of:
test 33.33%
Hello 16.67%
TEST 16.67%
TeSt 16.67%
Test 16.67%
You can compute the percentages while iterating over the sorted (word, count) pairs:
total = sum(d.values())
for k, v in dict_list[0:7]:
    percent = str(round(v / total * 100, 1)) + ' %'
    result = k + ' ' + percent
    print(result)

Unable to read value for variable outside loop in python

I am trying to create a list of dictionaries that contain lists of words under the 'body' and 'summ' keys, using spaCy. I am also using BeautifulSoup, since the actual data is raw HTML.
This is what I have so far:
from pymongo import MongoClient
from bs4 import BeautifulSoup as bs
import spacy
import string

clt = MongoClient('localhost')
db1 = clt['mchack']
db2 = clt['clean_data']
nlp = spacy.load('en')
valid_shapes = ['X.X', 'X.X.', 'X.x', 'X.x.', 'x.x', 'x.x.', 'x.X', 'x.X.']
cake = list()
sent_x = list()
temp_b = list()
temp_s = list()
sent_y = list()
table = str.maketrans(dict.fromkeys(string.punctuation))

for item in db1.article.find().limit(1):
    finale_doc = {}
    x = bs(item['news']['article']['Body'], 'lxml')
    y = bs(item['news']['article']['Summary'], 'lxml')
    for content in x.find_all('p'):
        v = content.text
        v = v.translate(table)
        sent_x.append(v)
    body = ' '.join(sent_x)
    for content in y.find_all('p'):
        v = content.text
        v = v.translate(table)
        sent_y.append(v)
    summ = ' '.join(sent_y)
    b_nlp = nlp(body)
    s_nlp = nlp(summ)
    for token in b_nlp:
        if token.is_alpha:
            temp_b.append(token.text.lower())
        elif token.shape_ in valid_shapes:
            temp_b.append(token.text.lower())
        elif token.pos_ == 'NUM':
            temp_b.append('<NUM>')
        elif token.pos_ == 'SYM':
            temp_b.append('<SYM>')
    for token in s_nlp:
        if token.is_alpha:
            temp_s.append(token.text.lower())
        elif token.shape_ in valid_shapes:
            temp_s.append(token.text.lower())
        elif token.pos_ == 'NUM':
            temp_s.append('<NUM>')
        elif token.pos_ == 'SYM':
            temp_s.append('<SYM>')
    finale_doc.update({'body': temp_b, 'summ': temp_s})
    cake.append(finale_doc)
    print(cake)
    del sent_x[:]
    del sent_y[:]
    del temp_b[:]
    del temp_s[:]
    del finale_doc

print(cake)
The first print statement gives the proper output:
'summ': ['as', 'per', 'the', 'budget', 'estimates', 'we', 'are', 'going', 'to', 'spend', 'rs', '<NUM>', 'crore', 'in', 'the', 'next', 'year'],
'body': ['central', 'government', 'has', 'proposed', 'spendings', 'worth', 'over', 'rs', '<NUM>', 'crore', 'on', 'medical', 'and', 'cash', 'benefits', 'for', 'workers', 'and', 'family', 'members']}]
However, after emptying the lists sent_x, sent_y, temp_b and temp_s, the output becomes:
[{'summ': [], 'body': []}]
You keep passing references to temp_b and temp_s. That's why, after emptying these lists, cake's content also changes (the values of the dictionary are the same objects as temp_b and temp_s)!
You simply need to make a copy before appending the finale_doc dict to the cake list.
finale_doc.update({'body': list(temp_b), 'summ': list(temp_s)})
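Equivalently, list.copy() (or a full slice like temp_b[:]) makes the same shallow copy:
finale_doc.update({'body': temp_b.copy(), 'summ': temp_s.copy()})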
You should try creating a minimal reproducible version of this, as it would meet Stack Overflow guidelines and you would likely answer your own problem.
I think what you are asking is this:
How can I empty a list without changing other references to that list?
I made some code and I think it should work:
items = []
contents = []
for value in (1, 2):
    contents.append(value)
    items.append(contents)
    print(contents)
    del contents[:]
print(items)
This prints [1], [2] like I want, but then it prints [[], []] instead of [[1], [2]].
Then I could answer your question:
Objects (including lists) persist and are shared by reference, so this won't work.
Instead of modifying (adding to and then emptying) the same list, you probably want to create a new list inside the loop. You can verify this by looking at id(contents) and id(items[0]), etc., and seeing they are all the same list. You can even do contents.append(None); print(items) and see that you now have [[None], [None]].
Try doing
for ...:
    contents = []
    contents.append(value)
instead of
contents = []
for ...:
    del contents[:]
Edit: Another answer suggests making a copy of the values as you add them. This will work, but in your case I feel that making a copy and then emptying the original is unnecessarily complicated. That might be appropriate if you continued to add to the list.
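For reference, here is a sketch of the corrected toy loop, creating a fresh list on every iteration:
items = []
for value in (1, 2):
    contents = []  # a new list object each time through the loop
    contents.append(value)
    items.append(contents)
print(items)  # [[1], [2]]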

Frequency of ngrams (strings) in tokenized text

I have a set of unique ngrams (list called ngramlist) and ngram tokenized text (list called ngrams). I want to construct a new vector, freqlist, where each element of freqlist is the fraction of ngrams that is equal to that element of ngramlist. I wrote the following code that gives the correct output, but I wonder if there is a way to optimize it:
freqlist = [
    sum(int(ngram == ngram_candidate)
        for ngram_candidate in ngrams) / len(ngrams)
    for ngram in ngramlist
]
I imagine there is a function in nltk or elsewhere that does this faster but I am not sure which one.
Thanks!
Edit: for what it's worth, the ngrams are produced as the joined output of nltk.util.ngrams, and ngramlist is just a list made from the set of all found ngrams.
Edit2:
Here is reproducible code to test the freqlist line (the rest of the code is not really what I care about)
from nltk.util import ngrams
import wikipedia
import nltk
import pandas as pd

articles = ['New York City', 'Moscow', 'Beijing']
tokenizer = nltk.tokenize.TreebankWordTokenizer()

data = {'article': [], 'treebank_tokenizer': []}
for article in articles:
    data['article'].append(wikipedia.page(article).content)
    data['treebank_tokenizer'].append(tokenizer.tokenize(data['article'][-1]))

df = pd.DataFrame(data)
df['ngrams-3'] = df['treebank_tokenizer'].map(
    lambda x: [' '.join(t) for t in ngrams(x, 3)])
ngramlist = list(set(trigram for sublist in df['ngrams-3'].tolist() for trigram in sublist))
df['freqlist'] = df['ngrams-3'].map(
    lambda ngrams_: [sum(int(ngram == ngram_candidate) for ngram_candidate in ngrams_) / len(ngrams_)
                     for ngram in ngramlist])
You can probably optimize this a bit by pre-computing some quantities and using a Counter. This will be especially useful if most of the elements in ngramlist are contained in ngrams.
freqlist = [
    sum(int(ngram == ngram_candidate)
        for ngram_candidate in ngrams) / len(ngrams)
    for ngram in ngramlist
]
You certainly don't need to iterate over ngrams every single time you check an ngram. One pass over ngrams will make this algorithm O(n) instead of the O(n²) one you have now. Remember, shorter code is not necessarily better or more efficient code:
from collections import Counter
...
counter = Counter(ngrams)
size = len(ngrams)
freqlist = [counter.get(ngram, 0) / size for ngram in ngramlist]
To use this function properly, you would have to write a def function instead of a lambda:
def count_ngrams(ngrams):
    counter = Counter(ngrams)
    size = len(ngrams)
    freqlist = [counter.get(ngram, 0) / size for ngram in ngramlist]
    return freqlist

df['freqlist'] = df['ngrams-3'].map(count_ngrams)
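If you want to verify the speedup yourself, here is a quick timeit sketch on toy data (the names ngrams_ and ngramlist mirror the ones above; the data is made up):
import timeit
from collections import Counter

ngrams_ = ['a b c', 'b c d', 'c d e'] * 300
ngramlist = list(set(ngrams_))

def slow():
    # quadratic: one full pass over ngrams_ per ngram checked
    return [sum(int(g == c) for c in ngrams_) / len(ngrams_) for g in ngramlist]

def fast():
    # linear: count once, then look up each ngram
    counter = Counter(ngrams_)
    size = len(ngrams_)
    return [counter.get(g, 0) / size for g in ngramlist]

assert slow() == fast()
print('quadratic:', timeit.timeit(slow, number=100))
print('counter:  ', timeit.timeit(fast, number=100))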
Firstly, don't pollute your imported functions by overriding them and using them as variables; keep the ngrams name for the function and use something else as the variable name.
import time
from functools import partial
from itertools import chain
from collections import Counter
import wikipedia
import pandas as pd
from nltk import word_tokenize
from nltk.util import ngrams
Next, the steps before the line you're asking about in the original question might be a little inefficient; you can clean them up, make them easier to read, and measure them as such:
# Downloading the articles.
titles = ['New York City','Moscow','Beijing']
start = time.time()
df = pd.DataFrame({'article':[wikipedia.page(title).content for title in titles]})
end = time.time()
print('Downloading wikipedia articles took', end-start, 'seconds')
And then:
# Tokenizing the articles
start = time.time()
df['tokens'] = df['article'].apply(word_tokenize)
end = time.time()
print('Tokenizing articles took', end-start, 'seconds')
Then:
# Extracting trigrams.
trigrams = partial(ngrams, n=3)
start = time.time()
# There's no need to flatten them to strings, you could just use list()
df['trigrams'] = df['tokens'].apply(lambda x: list(trigrams(x)))
end = time.time()
print('Extracting trigrams took', end-start, 'seconds')
Finally, on to the last line:
# Instead of a set, we use a Counter here because
# we can use an intersection between Counter objects later.
# see https://stackoverflow.com/questions/44012479/intersection-of-two-counters
all_trigrams = Counter(chain(*df['trigrams']))
# More often than not, you don't need to keep all the
# zeros in the vectors (aka dense vector),
# you could actually get the non-zero sparse vectors
# as a dict as such
df['trigrams_count'] = df['trigrams'].apply(lambda x: Counter(x) & all_trigrams)
# Now to normalize the count, simply do:
def featurize(list_of_ngrams):
    nonzero_features = Counter(list_of_ngrams) & all_trigrams
    total = len(list_of_ngrams)
    return {ng: count / total for ng, count in nonzero_features.items()}

df['trigrams_count_normalize'] = df['trigrams'].apply(featurize)
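If you do end up needing dense vectors after all (one position per trigram in a fixed vocabulary), here is a sketch of expanding the sparse dicts; vocab and to_dense are illustrative names, not part of the answer above:
# fix a vocabulary order, then fill missing trigrams with 0.0
vocab = sorted(all_trigrams)

def to_dense(sparse):
    return [sparse.get(ng, 0.0) for ng in vocab]

df['freqlist'] = df['trigrams_count_normalize'].apply(to_dense)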
