How do I append a text changing the number format? - python-3.x

I'm getting number from a HTML, some of them are %, 4 digits and 7 digits (37.89%, 3.464, 2,193.813). I would like to save just the numbers, not the percentages, without the thousand separators (".").
# NOTE(review): indentation was lost when this snippet was pasted, so the loop
# bodies below are flattened.  It collects text from BeautifulSoup-style
# findAll() results in `table` and appends the rows to list.csv.
list_of_rows = []
for row in table.findAll('div', attrs={'class': 'quadrado'}):
list_of_cells = []
for cell in row.findAll('span', attrs={'class': 'circulo'}):
text = cell.text
# print(text)
for cell_index in row.findAll('span', attrs={'class': 'triangulo'}):
text_index = cell_index.text
# Pair each cell value with its index label before flattening to one string.
list_of_cells_index = [text, text_index]
list_of_cells_index_clean = ','.join(list_of_cells_index) # remove brackets and ''
# print(list_of_cells_index_clean)
list_of_cells.append(list_of_cells_index_clean)
list_of_rows.append(list_of_cells)
# Opened in append mode: repeated runs keep adding rows to the same file.
outfile = open("./list.csv", "a")
writer = csv.writer(outfile, lineterminator = '\n')
writer.writerows(list_of_rows)
# NOTE(review): outfile is never closed — a with-block (or outfile.close())
# is presumably wanted; confirm against the full script.
I would like to get:
37.89%, 3464, 2193,813.
How can I do it?

I don't know all your input parameters, but this works for the ones that you provided.
# Demo: keep percentages as-is, strip the thousands separator from 4-digit
# values, and swap '.' for ',' in the 7-digit case.
s = ('37.89%', '3.464', '2,193.813')
for item in s:
    no_commas = item.replace(',', '')
    # A value like 37.89% is kept untouched.
    pct_match = re.findall(r'\d{1,4}\.\d{1,4}%', no_commas)
    if pct_match:
        print(''.join(pct_match))
    elif len(no_commas) == 5:
        # 4-digit number such as 3.464 -> 3464: drop the separator.
        print(no_commas.replace('.', ''))
    else:
        # 7-digit number such as 2193.813 -> 2193,813.
        print(no_commas.replace('.', ','))
**OUTPUTS**
37.89%
3464
2193,813

Related

Filter user names from a string

I'm trying to filter the usernames that are being referenced in a tweet like in the following example:
Example:
tw = 'TR #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
the desired output will be:
rt_unames = ['uname1', 'uname6']
mt_unames = ['uname2', 'uname3', 'uname4', 'uname5']
I can do something like a for loop that goes over the string like the naïve solution below:
Naïve Solution:
def find_end_idx(tw_part):
    """Return the index of the first ' ', '.' or ',' in *tw_part*,
    or len(tw_part) if none of those delimiters occurs.

    BUG FIX: the original body ignored its parameter and read the
    globals `tw` and `start_idx` instead, so it measured the wrong
    substring (and crashed with NameError when called standalone).
    """
    end_idx = len(tw_part)
    for delimiter in (' ', '.', ','):
        # str.find returns -1 when the delimiter is absent.
        pos = tw_part.find(delimiter)
        if pos != -1 and pos < end_idx:
            end_idx = pos
    return end_idx
# NOTE(review): indentation was flattened by the paste; the if/elif branches
# below belong inside the enumerate loop.  The script accumulates characters
# in `acc`, and whenever the tail of `acc` is "RT" (or a single '#') it slices
# the username out of `tw` using find_end_idx.
tw = 'RT #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
acc = ''
rt_unames = []
mt_unames = []
for i, c in enumerate(tw):
acc += c
# acc[::-1][:2][::-1] is just the last two characters of acc.
if acc[::-1][:2][::-1] == 'RT':
start_idx = i+2
end_idx = find_end_idx(tw_part=tw[start_idx:])
uname = tw[start_idx:start_idx+end_idx]
if uname not in mt_unames:
rt_unames.append(uname)
acc = ''
# acc[::-1][:1] is the last character of acc.
elif acc[::-1][:1]=='#':
start_idx = i
end_idx = find_end_idx(tw_part=tw[start_idx:])
uname = tw[start_idx:start_idx+end_idx]
if uname not in rt_unames:
mt_unames.append(uname)
acc = ''
# NOTE(review): bare expression — displays the tuple only in a REPL/notebook.
rt_unames, mt_unames
which outputs:
(['#uname1', '#uname6'], ['#uname2', '#uname3', '#uname4', '#uname5'])
Question:
As I need to apply it to every tweet in a pandas.DataFrame, I'm looking for a more elegant and fast solution to get this outcome.
I'd appreciate any suggestions.
Let's try re.findall with a regex pattern:
import re
rt_unames = re.findall(r'(?<=TR |RT )#([^,]+)', tw)
mt_unames = re.findall(r'(?<!TR |RT )#([^,]+)', tw)
In the similar way, you can use str.findall method on the column in dataframe:
df['rt_unames'] = df['tweet'].str.findall(r'(?<=TR |RT )#([^,]+)')
df['mt_unames'] = df['tweet'].str.findall(r'(?<!TR |RT )#([^,]+)')
Result:
['uname1', 'uname6']
['uname2', 'uname3', 'uname4', 'uname5']
If the format of input string is always the same, I would do it like this:
def parse_tags(str_tags):
    """Split a comma-separated tweet into (retweet_names, mention_names).

    BUG FIX: the original kept the leading '#' on every name and only
    recognised the 'RT' marker, so its output did not match the desired
    result in the question (names without '#', and 'TR' treated like 'RT').
    """
    rts = []
    others = []
    for tag in [tag.strip() for tag in str_tags.split(',')]:
        if tag.startswith(('RT ', 'TR ')):
            # Drop the 3-char "RT "/"TR " marker, then the leading '#'.
            rts.append(tag[3:].lstrip('#'))
        elif tag.startswith('#'):
            others.append(tag[1:])
    return rts, others
An alternative approach using filters and list comprehension.
import re
def your_func_name(tw):
    """Split tweet *tw* on commas and return (rt_unames, mt_unames).

    rt_unames come from chunks that start with "RT" and contain '#';
    mt_unames come from every chunk that starts with '#'.
    """
    chunks = [chunk.strip() for chunk in tw.split(",")]
    retweet_chunks = [c for c in chunks if "#" in c and c.startswith("RT")]
    mention_chunks = [c for c in chunks if c.startswith("#")]
    rt_unames = [re.sub(r"RT|#", "", c).strip() for c in retweet_chunks]
    mt_unames = [re.sub("#", "", c).strip() for c in mention_chunks]
    return rt_unames, mt_unames

# Demo call from the original answer.
tw = 'RT #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
your_func_name(tw=tw)
You can use regex patterns and use the apply function on the tweet column of your dataframe
import pandas as pd
import re
# pattern1 captures "RT #..."/"TR #..." chunks (up to a comma) in two
# alternation groups; pattern2 captures every "#..." chunk.
pattern1 = r"(RT\s+#[^\,]+)|(TR\s+#[^\,]+)"
pattern2 = r"#[^\,]+"
df = pd.DataFrame(
    ['TR #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'],
    columns=['Tweet'],
)
# Build each result column with a comprehension over the Tweet column;
# re.findall returns a list per row, exactly as the apply() version did.
df['group1'] = [re.findall(pattern1, tweet) for tweet in df.Tweet]
df['group2'] = [re.findall(pattern2, tweet) for tweet in df.Tweet]
This is my second time, so I will try to make it as easy as possible.
# Collect every '#'-prefixed token of the tweet and print them as one
# space-separated string.
tw = 'TR #uname1, #uname2, #uname3, text1, text2, #uname4, text3, #uname5, RT #uname6'
res = tw.replace(", ", " ").split()
final = []
k = "#"
for e in res:
    # BUG FIX: the original wrote `e[0].lower == k.lower`, comparing the
    # *bound method objects* (always unequal), so `final` stayed empty.
    # The methods must be called.
    if e[0].lower() == k.lower():
        final.append(e)
# Strip the list-literal punctuation from str(final) to get plain words.
stringe = str(final).replace(",", "")
stringe = stringe.replace("[", "")
stringe = stringe.replace("]", "")
stringe = stringe.replace("'", "")
print("Result is :", str(stringe))
from what I can see, you already know python, so this example should only take you a while.
Here, I use the replace function to replace all the commas (,) with blanks, and then use the split function, which separates the words wherever there are spaces. The result is then stored in res.
In the next few lines, I use the replace function to replace all unwanted strings like "[" and "]" and "'" , to be replaced by a blank.
Then, I simply print the result.
Hit me up at #Vishma Pratim Das on twitter if you don't understand something

Print the last occurrence of a word in a TXT in Python

I'm working on extracting data from a .txt file, and I want to pick up the last occurrence of a certain word in the whole file. In this case, I get three occurrences of the words D / DÑA and Tfno but I want only the last one of each one and print it.
# NOTE(review): indentation was flattened by the paste.  This is the
# question's code: it prints EVERY 'D/DÑA'/'Tfno' line, which is exactly
# the behaviour the asker wants to change (only the last of each).
def extract(in_filename):
if not os.path.exists(in_filename):
print("ERROR: Input file does not exist ❌ ")
sys.exit()
with open(in_filename, 'r') as file:
rows = file.readlines()
for row in rows:
# re.match only tests at the beginning of the line.
if re.match('D/DÑA', row):
row_parse = row.strip()
print(row_parse)
elif re.match('Tfno', row):
row_parse = row.strip()
print(row_parse)
extract('apd29.txt')
The output is:
D/DÑA: PRUEBA PRUEBITA
Tfno: 666666666
D/DÑA: PRUEBA PRUEBITA
Tfno: 666666666
D/DÑA: PRUEBA PRUEBITA <-- I need this
Tfno: 666666666 <-- I need this
I expect the output:
D/DÑA: PRUEBA PRUEBITA
Tfno: 666666666
Use reversed
Ex:
def extract(in_filename):
    """Print the last 'D/DÑA' line and the last 'Tfno' line of *in_filename*.

    Exits the interpreter if the file does not exist.

    BUG FIX: the original only reported the last 'D/DÑA' line, but the
    question asks for the last occurrence of BOTH 'D/DÑA' and 'Tfno'.
    """
    if not os.path.exists(in_filename):
        print("ERROR: Input file does not exist ❌ ")
        sys.exit()
    with open(in_filename, 'r') as file:
        rows = file.readlines()
    last_dna = None
    last_tfno = None
    # Walk from the end: the first hit of each pattern is its last occurrence.
    for row in reversed(rows):
        if last_dna is None and re.match(r'D/DÑA', row):
            last_dna = row.strip()
        elif last_tfno is None and re.match(r'Tfno', row):
            last_tfno = row.strip()
        if last_dna is not None and last_tfno is not None:
            break
    if last_dna is not None:
        print(last_dna)
    if last_tfno is not None:
        print(last_tfno)
extract('apd29.txt')
Assigning a variable outside of for loop would work in this situation.
# Keep overwriting `result` on every match; after the loop it holds the
# last matching line (or "not found" if there was none).
result = "not found"
with open(in_filename, 'r') as file:
    rows = file.readlines()
for row in rows:
    if re.match('D/DÑA', row):
        # BUG FIX: the original stored row_strip.rfind('D/DÑA') — an
        # integer index — so it printed a number (0) instead of the line.
        # Store the stripped line itself.
        result = row.strip()
print(result)

How to scramble/shuffle/randomize all the letters of a string in python except the first and the last letter?

For example :
Example 1:
string = "Jack and the bean stalk."
updated_string = "Jcak and the baen saltk."
Example 2:
string = "Hey, Do you want to boogie? Yes, Please."
updated_string = "Hey, Do you wnat to bogoie? Yes, Palsee."
Now this string is stored in a file.
I want to read this string from the file. And write the updated string back in the file at same positions.
Letters of each word of the string with length greater than 3 must be scrambled/shuffled while keeping first and last letter as it is. Also if there is any punctuation mark the punctuation mark stays as it is.
My approach:
import random
# NOTE(review): indentation was flattened by the paste.  This is the asker's
# own (admittedly broken) attempt at scrambling word interiors.
with open("path/textfile.txt","r+") as file:
file.seek(0)
my_list = []
for line in file.readlines():
# NOTE(review): line.split() is a LIST of words, so my_list becomes a list
# of lists and each `i` below is a whole line, not a word — confirm intent.
word = line.split()
my_list.append(word)
scrambled_list =[]
for i in my_list:
if len(i) >3:
print(i)
# NOTE(review): i[1] is the SECOND element; keeping the first letter
# fixed presumably wants i[0] — verify.
s1 = i[1]
s2 = i[-1]
# Interior slice to be shuffled (list slice, so shuffle works in place).
s3 = i[1:-1]
random.shuffle(s3)
y = ''.join(s3)
z = s1+y+s2+' '
print(z)
This is one approach.
Demo:
from random import shuffle
import string
punctuation = tuple(string.punctuation)
# Scramble the interior letters of each word of 4+ characters, keeping the
# first letter, the last letter, and a single trailing punctuation mark in
# place.  NOTE(review): `file` and `punctuation` come from the surrounding
# snippet's context (punctuation = tuple(string.punctuation)).
for line in file.readlines(): #Iterate lines
res = []
for i in line.split(): #Split sentence to words
punch = False
if len(i) >= 4: #Check if word is greater than 3 letters
if i.endswith(punctuation): #Check if words ends with punctuation
val = list(i[1:-2]) #Exclude last 2 chars
punch = True
else:
val = list(i[1:-1]) #Exclude last 1 chars
shuffle(val) #Shuffle letters excluding the first and last.
if punch:
# Reattach final letter + punctuation pair after the shuffled interior.
res.append("{0}{1}{2}".format(i[0], "".join(val), i[-2:]))
else:
res.append("{0}{1}{2}".format(i[0], "".join(val), i[-1]))
else:
# Words of 1-3 letters are left untouched.
res.append(i)
print(" ".join(res))

python 3 tab-delimited file adds column file.write

I'm writing string entries to a tab-delimited file in Python 3. The code that I use to save the content is:
# Ask for a destination and write one tab-delimited line per entity.
savedir = easygui.diropenbox()
savefile = input("Please type the filename (including extension): ")
sep = "\t"
# 'with' closes the handle even if a write raises (the original never closed
# on error).
with open(os.path.join(savedir, savefile), "w", encoding="utf-8") as file:
    file.write("Number of entities not found: " + str(missing_count) + "\n")
    for entry in entities:
        # entry is [query, records]; write the query, then each record.
        file.write(entry[0] + "\t")
        # BUG FIX: the original iterated `entry` itself, so the first `item`
        # was the query string and sep.join(item[0]) tab-joined its first
        # character — that is the mystery one-letter column.  Iterate the
        # record list and join each record's fields instead.
        for item in entry[1]:
            file.write(sep.join(item))
            file.write("\t")
        file.write("\n")
The file saves properly. There are no errors or warnings sent to the terminal. When I open the file, I find an extra column has been saved to the file.
Query | Extra | Name
Abu-Jamal, Mumia | A | Mumia Abu-Jamal
Anderson, Walter | A | Walter Inglis Anderson
Anderson, Walter | A | Walter Inglis Anderson
I've added vertical bars between the tabs for clarity; they don't normally appear there. As well, I have removed a few columns at the end. The column between the vertical bars is not supposed to be there. The document that is saved to file is longer than three lines. On each line, the extra column is the first letter of the Query column. Hence, we have A's in these three examples.
entry[0] corresponds exactly to the value in the Query column.
sep.join(item[0]) corresponds exactly to columns 3+.
Any idea why I would be getting this extra column?
Edit: I'm adding the full code for this short script.
# =============================================================================
# Code to query DBpedia for named entities.
#
# =============================================================================
import requests
import xml.etree.ElementTree as et
import csv
import os
import easygui
import re
# =============================================================================
# Default return type is XML. Others: json.
# Classes are: Resource (general), Place, Person, Work, Species, Organization
# but don't include resource as one of the
# =============================================================================
def urlBuilder(query, queryClass="unknown", returns=10):
    """Build a DBpedia KeywordSearch lookup URL.

    queryClass 'place', 'person' or 'org' selects a QueryClass filter;
    any other value leaves the filter empty.  `returns` sets MaxHits.
    """
    prefix = 'http://lookup.dbpedia.org/api/search/KeywordSearch?'
    # Dispatch table replaces the if/elif chain of the original.
    class_params = {
        'place': 'QueryClass=place',
        'person': 'QueryClass=person',
        'org': 'QueryClass=organization',
    }
    qClass = class_params.get(queryClass, 'QueryClass=')
    qString = "QueryString=" + str(query)
    qHits = "MaxHits=" + str(returns)
    return prefix + qClass + "&" + qString + "&" + qHits
#takes a xml doc as STRING and returns an array with the name and the URI
def getdbpRecord(xmlpath):
    """Parse a KeywordSearch XML *string* into [label, uri, dates] rows.

    The third child of each result is run through findDates; an empty
    element (text is None) is recorded as "Empty".
    """
    root = et.fromstring(xmlpath)
    records = []
    for result in root:
        label = result[0].text
        uri = result[1].text
        description = result[2].text
        # An empty XML element parses with .text == None.
        dates = "Empty" if description is None else findDates(description)
        records.append([label, uri, dates])
    return records
#looks for a date with pattern: 1900-01-01 OR 01 January 1900 OR 1 January 1900
def findDates(x):
    """Return all dates found in *x* joined by ';', or the string "None".

    Recognised formats: 1900-01-01, 01 January 1900, 1 January 1900.
    """
    # Raw string avoids invalid-escape warnings on '\d'/'\s'; \d{1,2}
    # collapses the original's separate two-digit and one-digit day
    # alternatives into one branch with identical matches.
    pattern = re.compile(r'\d{4}-\d{2}-\d{2}|\d{1,2}\s\w{3,9}\s\d{4}')
    returns = pattern.findall(x)
    if returns:
        return ";".join(returns)
    return "None"
#%%
# =============================================================================
# Build and send get requests
# =============================================================================
# Interactive setup: pick the input CSV and read every row into `lookups`.
print("Please select the CSV file that contains your data.")
csvfilename = easygui.fileopenbox("Please select the CSV file that contains your data.")
lookups = []
# Each row is expected to be [name] or [name, queryClass].
name_list = csv.reader(open(csvfilename, newline=''), delimiter=",")
for name in name_list:
lookups.append(name)
#request to get the max number of returns from the user.
temp = input("Specify the maximum number of returns desired: ")
if temp.isdigit():
# NOTE(review): maxHits stays a string here (vs the int 10 below); harmless
# because urlBuilder passes it through str() either way.
maxHits = temp
else:
maxHits = 10
queries = []
print("Building queries. Please wait.")
for search in lookups:
    if len(search) == 2:
        # Row = [name, queryClass].
        queries.append([search[0], urlBuilder(query=search[0], queryClass=search[1], returns=maxHits)])
    elif search:
        # BUG FIX: the original passed the whole row `search` (a list) as the
        # query and stored it as the key — later code does entry[0] + "\t",
        # which needs a string.  Use the name cell instead, and skip empty
        # CSV rows entirely.
        queries.append([search[0], urlBuilder(query=search[0], returns=maxHits)])
responses = []
print("Gathering responses. Please wait.")
for item in queries:
    # One blocking GET per query; decode the body as UTF-8 XML text.
    response = requests.get(item[1])
    data = response.content.decode("utf-8")
    responses.append([item[0], data])
entities = []
missing_count = 0
for item in responses:
    # Keep only responses whose XML document has at least one result element.
    # (The original also created an unused `temp = []` here.)
    if len(list(et.fromstring(item[1]))) > 0:
        entities.append([item[0], getdbpRecord(item[1])])
    else:
        missing_count += 1
print("There are " + str(missing_count) + " entities that were not found.")
print("Please select the destination folder for the results of the VIAF lookup.")
savedir = easygui.diropenbox("Please select the destination folder for the results of the VIAF lookup.")
savefile = input("Please type the filename (including extension): ")
sep = "\t"
# 'with' guarantees the handle is closed even if a write fails.
with open(os.path.join(savedir, savefile), "w", encoding="utf-8") as file:
    file.write("Number of entities not found: " + str(missing_count) + "\n")
    for entry in entities:
        # entry is [query, records]; write the query, then each record.
        file.write(entry[0] + "\t")
        # BUG FIX: iterate the record list (entry[1]), not entry itself.
        # The old loop's first item was the query string, and
        # sep.join(item[0]) tab-joined its characters, producing the stray
        # one-letter column described in the question.
        for item in entry[1]:
            file.write(sep.join(item))
            file.write("\t")
        file.write("\n")

How to avoid repetition in my code

I've written a code that extracts all the words from two files, and only returns the words that are in both of the file.
However, I have done some repetition and that is not considered good style, so I am wondering if it would be possible to avoid this in my code?
import re
def _read_words(filename):
    """Return the set of lowercase alphabetic words in *filename*."""
    # with-block closes the handle even if reading raises (the original
    # leaked it on error).
    with open(filename, 'r') as input_file:
        source_string = input_file.read().lower()
    return set(re.findall('[a-zA-Z]+', source_string))


def print_common_words(filename_1, filename_2):
    """Print, one per line in sorted order, the words both files share.

    Prints an error message instead of raising if either file is missing.

    FIX: the duplicated read-file-and-extract-words code (the defect the
    question asks about) is factored into the _read_words helper.
    """
    try:
        common = _read_words(filename_1) & _read_words(filename_2)
        for word in sorted(common):
            print(word)
    except FileNotFoundError:
        print("A file could not be found.")
Use a method to factor out the duplicated code.
def get_file(file):
    """Return the set of lowercase alphabetic words contained in *file*.

    FIX: use a with-block so the handle is closed even if read() raises;
    the original open/close pair leaked the handle on error.
    """
    with open(file, 'r') as input_file:
        source_string = input_file.read().lower()
    return set(re.findall('[a-zA-Z]+', source_string))
Call it like:
all_words1 = get_file(filename_1)
all_words2 = get_file(filename_2)
Eg:
# Read both files through the shared helper, then print the sorted
# intersection of their word sets.
all_words1 = get_file(filename_1)
all_words2 = get_file(filename_2)
intersection_list = all_words1.intersection(all_words2)
# NOTE(review): despite its name, union_list holds the intersection.
union_list = []
for word in intersection_list:
union_list += [word]
union_list.sort()
for i in union_list:
print(i)

Resources