I'm trying to filter the usernames that are being referenced in a tweet like in the following example:
Example:
tw = 'TR #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
the desired output will be:
rt_unames = ['uname1', 'uname6']
mt_unames = ['uname2', 'uname3', 'uname4', 'uname5']
I can do something like a for loop that goes over the string like the naïve solution below:
Naïve Solution:
def find_end_idx(tw_part):
    """Return the offset of the first delimiter in ``tw_part``.

    A username is considered to end at the first space, period or comma.
    The offset is relative to the start of ``tw_part``.  When none of the
    delimiters occurs, ``len(tw_part)`` is returned, so slicing with the
    result keeps the whole remaining string.

    Args:
        tw_part: The part of the tweet that starts at the username.

    Returns:
        The index of the earliest delimiter, or ``len(tw_part)``.
    """
    # str.find reports "not found" as -1; drop those before taking the
    # minimum so a missing delimiter can never win.
    hits = [pos for pos in map(tw_part.find, ' .,') if pos != -1]
    return min(hits, default=len(tw_part))
tw = 'RT #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
rt_unames, mt_unames = [], []
# Characters seen since the last recognised marker.
acc = ''
for i, ch in enumerate(tw):
    acc = acc + ch
    if acc[-2:] == 'RT':
        # The name is taken to start two characters after the 'T' of "RT".
        start_idx = i + 2
        end_idx = find_end_idx(tw_part=tw[start_idx:])
        uname = tw[start_idx:start_idx + end_idx]
        if uname not in mt_unames:
            rt_unames.append(uname)
        acc = ''
    elif acc.endswith('#'):
        # A bare '#': the name starts at the '#' itself.
        start_idx = i
        end_idx = find_end_idx(tw_part=tw[start_idx:])
        uname = tw[start_idx:start_idx + end_idx]
        # Names already recorded as retweets are not repeated here.
        if uname not in rt_unames:
            mt_unames.append(uname)
        acc = ''
rt_unames, mt_unames
which outputs:
(['#uname1', '#uname6'], ['#uname2', '#uname3', '#uname4', '#uname5'])
Question:
As I need to apply it to every tweet in a pandas.DataFrame, I'm looking for a more elegant and fast solution to get this outcome.
I'd appreciate any suggestions.
Let's try re.findall with a regex pattern:
import re
# A '#' directly preceded by "TR " or "RT " (positive lookbehind) marks a
# retweeted user; the group captures everything up to the next comma.
rt_unames = re.findall(r'(?<=TR |RT )#([^,]+)', tw)
# Same capture, but the negative lookbehind keeps only the '#' that is NOT
# preceded by "TR " or "RT ".
mt_unames = re.findall(r'(?<!TR |RT )#([^,]+)', tw)
In a similar way, you can use the str.findall method on the column of a dataframe:
# Same two patterns as above, applied to the 'tweet' column with str.findall.
# '#' preceded by "TR " / "RT " goes to rt_unames ...
df['rt_unames'] = df['tweet'].str.findall(r'(?<=TR |RT )#([^,]+)')
# ... and every other '#' goes to mt_unames.
df['mt_unames'] = df['tweet'].str.findall(r'(?<!TR |RT )#([^,]+)')
Result:
['uname1', 'uname6']
['uname2', 'uname3', 'uname4', 'uname5']
If the format of input string is always the same, I would do it like this:
def parse_tags(str_tags):
    """Split a comma-separated tweet into retweeted and mentioned usernames.

    Each comma-separated field is stripped and classified:

    * ``"RT #name"`` or ``"TR #name"`` (both markers appear in the example
      input) is a retweeted user;
    * ``"#name"`` is a mentioned user;
    * anything else is plain text and is ignored.

    The leading ``#`` is removed from every returned name, matching the
    desired output of the question.

    Args:
        str_tags: The tweet text, with fields separated by commas.

    Returns:
        A ``(rts, others)`` tuple of two lists of usernames.
    """
    rts = []
    others = []
    for raw in str_tags.split(','):
        tag = raw.strip()
        # Require a whole "RT"/"TR" word followed by a '#', so ordinary text
        # that merely begins with those letters is not taken for a retweet.
        marker, _, rest = tag.partition(' ')
        rest = rest.lstrip()
        if marker in ('RT', 'TR') and rest.startswith('#'):
            rts.append(rest[1:])
        elif tag.startswith('#'):
            others.append(tag[1:])
    return rts, others
An alternative approach using filters and list comprehension.
import re
def your_func_name(tw):
    """Split a comma-separated tweet into retweeted and mentioned usernames.

    Args:
        tw: The tweet text, with fields separated by commas.

    Returns:
        A ``(rt_unames, mt_unames)`` tuple of lists.  ``rt_unames`` comes from
        fields that start with "RT" and contain a '#'; ``mt_unames`` from
        fields that start with '#'.  All '#' characters are removed.
    """
    tw_list = [part.strip() for part in tw.split(",")]
    rt_unames = [
        # Cut off only the leading "RT" marker.  Substituting "RT" away with
        # a regex would also delete those letters inside the username itself
        # (e.g. "#ARTIST" would become "AIST").
        part[2:].replace("#", "").strip()
        for part in tw_list
        if part.startswith("RT") and "#" in part
    ]
    mt_unames = [
        part.replace("#", "").strip()
        for part in tw_list
        if part.startswith("#")
    ]
    return rt_unames, mt_unames
tw = 'RT #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
your_func_name(tw=tw)
You can use regex patterns and use the apply function on the tweet column of your dataframe
import pandas as pd
import re
# Usernames that follow an "RT"/"TR" marker.  The marker sits in a
# non-capturing group so that re.findall returns plain username strings; with
# two capturing groups it would return one tuple per match instead.
pattern1 = r"(?:RT|TR)\s+#([^,]+)"
# All remaining usernames: a '#' that is not preceded by "RT"/"TR" plus one
# whitespace character.  Lookbehinds must be fixed-width, hence the single \s.
pattern2 = r"(?<!RT\s)(?<!TR\s)#([^,]+)"
df = pd.DataFrame(['TR #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'], columns=['Tweet'])
# Each cell of group1/group2 holds the list of usernames found in that tweet.
df['group1'] = df.Tweet.apply(lambda x: re.findall(pattern1, x))
df['group2'] = df.Tweet.apply(lambda x: re.findall(pattern2, x))
This is only my second answer here, so I will try to keep it as simple as possible.
tw = 'TR #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
# Turn every ", " separator into a single space, then split on whitespace.
res = tw.replace(", ", " ").split()
k = "#"
# Keep the tokens that start with the marker.  The characters are compared
# directly: comparing `e[0].lower` with `k.lower` (no call parentheses) would
# compare two method objects rather than the text.
final = [e for e in res if e.startswith(k)]
# Render the list as text and strip the list punctuation with replace.
stringe = str(final)
for unwanted in (",", "[", "]", "'"):
    stringe = stringe.replace(unwanted, "")
print("Result is :", str(stringe))
From what I can see, you already know Python, so this example should not take you long to follow.
Here, I use the replace function to turn every ", " separator into a single space, and then the split function, which separates the words on whitespace. The result is stored in res.
In the last few lines, I use the replace function to remove the unwanted characters ("[", "]", "'" and ",") from the string form of the list.
Then, I simply print the result.
Hit me up at #Vishma Pratim Das on twitter if you don't understand something
Related
I have a list 'lst1' and want to append multiple values into a single row where they exist. Can anyone help me out with this?
lst1 = [['cnl','fb123','ins54'],['ins45'],['abc','xyz'],['abc','xyz','fb765','ins567']]
adn = ['ab','cc']
fb = []
ins = []
otr = []
for lnk in lst1:
for lnk2 in lnk:
if 'fb' in lnk2:
try:
fb.append(lnk2)
except:
fb.append("")
elif 'ins' in lnk2:
try:
ins.append(lnk2)
except:
ins.append("")
elif ('fb' or 'ins') not in lnk2:
try:
otr.append(lnk2)
except:
otr.append("")
data = {}
data = {'fb': fb, 'ins': ins, 'otr': otr, 'adn': adn}
result = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in data.items()]))
result.to_csv("raw_data.csv", index = False)
Expected Output:
fb ins otr adn
0 fb123 ins54 cnl ab
1 ins45 cc
2 abc,xyz
3 fb765 ins567 abc,xyz
I have also tried the 'extend' function, but I was unable to get the desired output.
I don't understand why in the output example the third and fourth lines are empty ? And why are 'abc, xyz' in the second line?
Implemented based only on the description. If you want to exclude duplicates, you can additionally convert each *_check list to a set.
import pandas as pd
lst1 = [['cnl', 'fb123', 'ins54'], ['ins45'], ['abc', 'xyz'], ['abc', 'xyz', 'fb765', 'ins567']]
adn = ['ab', 'cc']
fb = []
ins = []
otr = []
# One output row per inner list: bucket its words by prefix and join each
# bucket with commas.
for lnk in lst1:
    fb_check = [word for word in lnk if word.startswith('fb')]
    ins_check = [word for word in lnk if word.startswith('ins')]
    otr_check = [word for word in lnk if not word.startswith(('fb', 'ins'))]
    # ','.join of an empty list is already '', so no special case is needed.
    fb.append(','.join(fb_check))
    ins.append(','.join(ins_check))
    otr.append(','.join(otr_check))
# Pad adn with blanks up to the number of rows.  A negative count yields an
# empty list, so this cannot loop forever when adn is the longer list.
adn.extend([''] * (len(fb) - len(adn)))
data = {'fb': fb, 'ins': ins, 'otr': otr, 'adn': adn}
# Wrapping each column in a Series lets columns of unequal length coexist.
result = pd.DataFrame({k: pd.Series(v) for k, v in data.items()})
print(result)
result.to_csv("raw_data.csv", index=False)
Output:
fb ins otr adn
0 fb123 ins54 cnl ab
1 ins45 cc
2 abc,xyz
3 fb765 ins567 abc,xyz
I want to replace these symbols with '-' and I know there should be a better way than doing this:
if '/' in var1:
var1= var1.replace('/', '-')
if '#' in var1:
var1= var1.replace('#', '-')
if ';' in var1:
var1 = var1.replace(';', '-')
if ':' in var1:
var1= var1.replace(':', '-')
This is what I tried, which is clearly wrong and I'm not able to properly optimize it.
str = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
print([str.replace(i,'-') for i in str])
replaceAll doesn't work either; it gives me an error saying that str does not have that attribute.
str.replaceAll("[<>]", "")
How about using str.translate()?
# build a translation table that maps each of "#", ":", ";", "/" to a hyphen
hyphenator = str.maketrans(dict.fromkeys("#:;/", "-"))
# str.translate applies the whole table in a single pass over the string
print("Testing PRI/Sec (#434242332;PP:432:133423846,335)".translate(hyphenator))
Or, even faster, use a compiled regex:
# One alternation of the escaped symbols, compiled once for reuse.
compiled_re = re.compile("|".join(map(re.escape, "#:;/")))
print(compiled_re.sub("-", "Testing PRI/Sec (#434242332;PP:432:133423846,335)"))
Both of these methods are much faster than the other methods proposed (at least on that input):
import re
import timeit
s = "Testing PRI/Sec (#434242332;PP:432:133423846,335)"
a = ["#", ":", ";", "/"]
hyphenator = str.maketrans({c: "-" for c in "#:;/"})
# Defined here so the benchmark is self-contained; compiled_re_sub() would
# otherwise fail with a NameError.
compiled_re = re.compile("|".join(re.escape(i) for i in a))


# Each function below performs one replacement strategy on `s` and discards
# the result; only the elapsed time is of interest.

def str_translate():
    """Replace via a prebuilt str.translate table."""
    s.translate(hyphenator)


def join_generator():
    """Replace character by character inside a generator fed to str.join."""
    "".join("-" if ch in a else ch for ch in s)


def append_in_loop():
    """Replace character by character with repeated string concatenation."""
    temp = ""
    for i in s:
        if i in a:
            temp += "-"
        else:
            temp += i


def re_sub():
    """Replace with re.sub, rebuilding the pattern string on every call."""
    re.sub("|".join(re.escape(i) for i in a), "-", s)


def compiled_re_sub():
    """Replace with the precompiled pattern."""
    compiled_re.sub("-", s)


# Guarded so that importing this code does not start the multi-second
# benchmark; running it as a script behaves as before.
if __name__ == "__main__":
    for method in [str_translate, join_generator, re_sub, append_in_loop, compiled_re_sub]:
        # run a million iterations and report the total time
        print("{} took a total of {}s".format(method.__name__, timeit.timeit(method)))
Results on my machine:
str_translate took a total of 1.1160085709998384s
join_generator took a total of 4.599312704987824s
re_sub took a total of 4.101858579088002s
append_in_loop took a total of 4.257988628000021s
compiled_re_sub took a total of 1.0353244650177658s
s = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
# Walk the string once, emitting '-' for every character listed in `a`.
pieces = []
for ch in s:
    pieces.append('-' if ch in a else ch)
print(''.join(pieces))
Prints:
Testing PRI-Sec (-434242332-PP-432-133423846,335)
Or using re:
import re
s = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
# Escape each symbol, OR them together, and substitute '-' for any match.
alternation = '|'.join(map(re.escape, a))
print(re.sub(alternation, '-', s))
Prints:
Testing PRI-Sec (-434242332-PP-432-133423846,335)
Use re package
import re
string = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
# A character class matches any single one of the four symbols.
symbols = re.compile('[#:;/]')
result = symbols.sub("-", string)
print(result)
Result:
Testing PRI-Sec (-434242332-PP-432-133423846,335)
Just loop through the string and add each character to the temp variable, unless it is in the list "a"; in that case, add "-" to the variable instead.
# Named `text` rather than `str`: rebinding `str` would shadow the builtin
# type and break later calls such as str.maketrans(...) or str(x).
text = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
temp = ''
# Copy the text character by character, writing '-' for each listed symbol.
for i in text:
    if i in a:
        temp = temp + "-"
    else:
        temp = temp + i
print(temp)
I'm trying to remove several words in each value of a column but nothing is happening.
stop_words = ["and","lang","naman","the","sa","ko","na",
"yan","n","yang","mo","ung","ang","ako","ng",
"ndi","pag","ba","on","un","Me","at","to",
"is","sia","kaya","I","s","sla","dun","po","b","pro"
]
newdata['Verbatim'] = newdata['Verbatim'].replace(stop_words,'', inplace = True)
I'm trying to generate a word cloud from the result of the replacement, but I keep getting the same words (ones that don't mean anything but occur in large volume).
You can use word boundaries (\b) around each value and join the values with | (regex OR):
# Wrap every stop word in \b word boundaries and OR the alternatives together.
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
# regex=True must be explicit: from pandas 2.0 on, str.replace treats the
# pattern as a literal string by default and would replace nothing here.
newdata['Verbatim'] = newdata['Verbatim'].str.replace(pat, '', regex=True)
Another solution is to split the values, remove the stop words and join the rest back with a space in a lambda function:
# A set makes the per-word membership test cheap.
stop_words = set(stop_words)


def f(x):
    """Return x with its whitespace-separated stop words removed."""
    kept = [w for w in x.split() if w not in stop_words]
    return ' '.join(kept)


newdata['Verbatim'] = newdata['Verbatim'].apply(f)
Sample:
stop_words = ["and","lang","naman","the","sa","ko","na",
              "yan","n","yang","mo","ung","ang","ako","ng",
              "ndi","pag","ba","on","un","Me","at","to",
              "is","sia","kaya","I","s","sla","dun","po","b","pro"
              ]
newdata = pd.DataFrame({'Verbatim':['I love my lang','the boss come to me']})
# Variant 1: one regex of \b-delimited alternatives.  regex=True must be
# explicit because pandas 2.0+ treats the pattern as a literal by default.
pat = '|'.join(r"\b{}\b".format(x) for x in stop_words)
newdata['Verbatim1'] = newdata['Verbatim'].str.replace(pat, '', regex=True)
# Variant 2: split, filter against a set, join back with single spaces.
# (Rebinding stop_words itself, so the lambda below really uses the set.)
stop_words = set(stop_words)
f = lambda x: ' '.join(w for w in x.split() if w not in stop_words)
newdata['Verbatim2'] = newdata['Verbatim'].apply(f)
print (newdata)
Verbatim Verbatim1 Verbatim2
0 I love my lang love my love my
1 the boss come to me boss come me boss come me
I'm getting number from a HTML, some of them are %, 4 digits and 7 digits (37.89%, 3.464, 2,193.813). I would like to save just the numbers, not the percentages, without the thousand separators (".").
list_of_rows = []
for row in table.findAll('div', attrs={'class': 'quadrado'}):
list_of_cells = []
for cell in row.findAll('span', attrs={'class': 'circulo'}):
text = cell.text
# print(text)
for cell_index in row.findAll('span', attrs={'class': 'triangulo'}):
text_index = cell_index.text
list_of_cells_index = [text, text_index]
list_of_cells_index_clean = ','.join(list_of_cells_index) # remove brackets and ''
# print(list_of_cells_index_clean)
list_of_cells.append(list_of_cells_index_clean)
list_of_rows.append(list_of_cells)
outfile = open("./list.csv", "a")
writer = csv.writer(outfile, lineterminator = '\n')
writer.writerows(list_of_rows)
I would like to get:
37.89%, 3464, 2193,813.
How can I do it?
I don't know all your input parameters, but this works for the ones that you provided.
def format_number(item):
    """Reformat one scraped number string.

    Rules, applied after removing every comma:

    * a percentage such as ``'37.89%'`` is returned unchanged;
    * a five-character value such as ``'3.464'`` loses its ``'.'``
      (treated as a thousands separator);
    * any other value has its ``'.'`` replaced by ``','``.

    The length test is a heuristic fitted to the sample values only.

    Args:
        item: The raw cell text.

    Returns:
        The reformatted string.
    """
    remove_comma = item.replace(',', '')
    percentages = re.findall(r'\d{1,4}\.\d{1,4}%', remove_comma)
    if percentages:
        return ''.join(percentages)
    if len(remove_comma) == 5:
        return remove_comma.replace('.', '')
    return remove_comma.replace('.', ',')


s = ('37.89%', '3.464', '2,193.813')
for item in s:
    print (format_number(item))
**OUTPUTS**
37.89%
3464
2193,813
import pickle
def create_dict():
    """Read ./images/image_dict.txt and pickle a term -> fields mapping.

    Lines that start with a space are skipped.  Every other line is split
    at ": " into a term and a remainder, and the remainder is split at ",".
    The first list of fields seen for a term is kept.  The mapping is
    pickled to ./images/final_image_dict.txt; nothing is returned.
    """
    f_name = "./images/image_dict.txt"
    final_image_dict = {}
    handle = open(f_name, encoding = 'utf-8')
    for line in handle:
        if line.startswith(" "):
            continue
        terms = line.split(": ")
        term, remainder = terms[0], terms[1]
        dict_tuple = remainder.split(",")
        # Unused afterwards; still evaluated, so a line without a comma
        # raises IndexError here.
        caption, image = dict_tuple[0], dict_tuple[1]
        # Keep the value already stored for this term, if there is one.
        final_image_dict.setdefault(term, dict_tuple)
    with open("./images/final_image_dict.txt", "wb") as image_infile:
        pickle.dump(final_image_dict, image_infile)
I am trying, with the above function, to create a dictionary in the format key: (caption, image) from a text file of the following format:
addugliare: (Coil a rope = Avvolgere a spire una cima,addugliare.gif),
admiral: (classic anchor= ancora classico,admiral.gif),
aft: (verso la poppa, aft.gif),
alberatura: (mastage,alberatura.gif),
albero: (mast = albero, albero.gif),
ancore: (anchors, anchore.gif),
andatu: (tacks, andatu.gif),
armi: (sailing craft, armi.gif),
bearing: (rilevamento , bearing.gif), etc
My problem is in creating the tuple for the value.
The above gives {'mooring': [' (ormeggio', ' mooring.gif)', '\n'], 'knot(speed)': [' (nodo(velocità)', ' knot.gif)', '\n'], 'addugliare': [' (Coil a rope = Avvolgere a spire una cima', 'addugliare.gif)', rather than 'mooring': ('ormeggio','mooring.gif') which is the format that I want. Could someone please help. I have also tried (caption, image) which seems to return a tuple of a tuple which doesn't work for me either
Maybe something like this (modified to ignore blank lines and trailing whitespace):
def extractTuple(s):
    """Parse a ``"(caption, image)"`` string into a ``(caption, image)`` tuple.

    Surrounding whitespace is stripped, the first and last characters (the
    parentheses) are dropped, and the rest is split at its LAST comma, so a
    caption that itself contains commas stays in one piece.

    Args:
        s: Text of the form ``"(caption, image)"``.

    Returns:
        A 2-tuple of stripped strings.

    Raises:
        IndexError: If the text between the parentheses contains no comma.
    """
    s = s.strip()
    p = s[1:-1].rsplit(',', 1)
    return (p[0].strip(), p[1].strip())
def dictFromFile(fname):
    """Build a ``{term: (caption, image)}`` dict from a definitions file.

    Each non-blank line is expected to look like ``term: (caption, image),``;
    the trailing comma is optional.

    Args:
        fname: Path of the text file to read.

    Returns:
        A dict mapping each term to the tuple produced by ``extractTuple``.

    Raises:
        ValueError: If a non-blank line contains no ':'.
    """
    d = {}
    # The sample data contains non-ASCII text, so decode explicitly as UTF-8
    # rather than relying on the platform default encoding.
    with open(fname, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (including the one after a final newline) carry no
            # entry and would otherwise fail to unpack below.
            if not line:
                continue
            if line.endswith(','):
                line = line[:-1]
            # Split at the first ':' only, so a ':' inside the caption stays
            # part of the value.
            k, v = line.split(':', 1)
            d[k] = extractTuple(v)
    return d
With your example data:
>>> d = dictFromFile("test.txt")
>>> for k in d: print(k,':',d[k])
admiral : ('classic anchor= ancora classico', 'admiral.gif')
armi : ('sailing craft', 'armi.gif')
addugliare : ('Coil a rope = Avvolgere a spire una cima', 'addugliare.gif')
aft : ('verso la poppa', 'aft.gif')
andatu : ('tacks', 'andatu.gif')
alberatura : ('mastage', 'alberatura.gif')
albero : ('mast = albero', 'albero.gif')
ancore : ('anchors', 'anchore.gif')
bearing : ('rilevamento', 'bearing.gif')