Extracting particular word from existing Sentence - python-3.x

I have a following string
My idea is to extract particular word and update the same in different column. For e.g
I could not get an idea how to extract the same

my_string ="TT Load:Route1 TT Load for 30 out of 80 from Route 2"
description=my_string.split(":")[0]
route_start = my_string.find('Route')
route_end= my_string.find('Route')+6
route= my_string[route_start : route_end]
TTLoad = my_string[route_end:].split('TT Load')
res = [int(i) for i in TTLoad[1].split() if i.isdigit()]
TTLoad_value= res[0]
Out_Of = my_string[route_end:].split('out of')
res = [int(i) for i in Out_Of[1].split() if i.isdigit()]
Out_Of_value= res[0]
required_dict={ "GivenDescription":my_string,
"Description":description,
"Route": route,
"TTLoad" :TTLoad_value,
"Out of" : Out_Of_value}
df=pd.DataFrame.from_dict(required_dict,orient='index').T

Related

is there a method to collect data intelligently from website?

i want to get data from this link https://meshb.nlm.nih.gov/treeView
the problem is to get all the tree, we should click on + each time and for each line to get the children node of the tree,
but I want to display all the tree just on one click then i want to copy all the content.
Any ideas, please?
Well, it all depends what you mean by "intelligently". Not sure if that meets the criteria, but you might want to try this.
import json
import string
import requests
abc = string.ascii_uppercase
base_url = "https://meshb.nlm.nih.gov/api/tree/children/"
follow_url = "https://meshb.nlm.nih.gov/record/ui?ui="
tree = {}
for letter in abc[:1]:
res = requests.get(f"{base_url}{letter}").json()
tree[letter] = {
"Records": [i["RecordName"] for i in res],
"FollowURLS": [f"{follow_url}{i['RecordUI']}" for i in res],
}
print(json.dumps(tree, indent=2))
This prints:
{
"A": {
"Records": [
"Body Regions",
"Musculoskeletal System",
"Digestive System",
"Respiratory System",
"Urogenital System",
"Endocrine System",
"Cardiovascular System",
"Nervous System",
"Sense Organs",
"Tissues",
"Cells",
"Fluids and Secretions",
"Animal Structures",
"Stomatognathic System",
"Hemic and Immune Systems",
"Embryonic Structures",
"Integumentary System",
"Plant Structures",
"Fungal Structures",
"Bacterial Structures",
"Viral Structures"
],
"FollowURLS": [
"https://meshb.nlm.nih.gov/record/ui?ui=D001829",
"https://meshb.nlm.nih.gov/record/ui?ui=D009141",
"https://meshb.nlm.nih.gov/record/ui?ui=D004064",
"https://meshb.nlm.nih.gov/record/ui?ui=D012137",
"https://meshb.nlm.nih.gov/record/ui?ui=D014566",
"https://meshb.nlm.nih.gov/record/ui?ui=D004703",
"https://meshb.nlm.nih.gov/record/ui?ui=D002319",
"https://meshb.nlm.nih.gov/record/ui?ui=D009420",
"https://meshb.nlm.nih.gov/record/ui?ui=D012679",
"https://meshb.nlm.nih.gov/record/ui?ui=D014024",
"https://meshb.nlm.nih.gov/record/ui?ui=D002477",
"https://meshb.nlm.nih.gov/record/ui?ui=D005441",
"https://meshb.nlm.nih.gov/record/ui?ui=D000825",
"https://meshb.nlm.nih.gov/record/ui?ui=D013284",
"https://meshb.nlm.nih.gov/record/ui?ui=D006424",
"https://meshb.nlm.nih.gov/record/ui?ui=D004628",
"https://meshb.nlm.nih.gov/record/ui?ui=D034582",
"https://meshb.nlm.nih.gov/record/ui?ui=D018514",
"https://meshb.nlm.nih.gov/record/ui?ui=D056229",
"https://meshb.nlm.nih.gov/record/ui?ui=D056226",
"https://meshb.nlm.nih.gov/record/ui?ui=D056224"
]
}
}
If you want all of it, just remove [:1] from the loop. If there's no entry for a given letter on the page you'll get, well, an empty entry in the dictionary.
Obviously, you can dump the entire response, but that's just a proof of concept.
Try this, some parts are a bit tricky but it manages to give you the tree:
import requests as r
import operator
import string
link = 'https://meshb.nlm.nih.gov/api/tree/children/{}'
all_data = []
for i in string.ascii_uppercase:
all_data.append({'RecordName': i, 'RecordUI': '', 'TreeNumber': i, 'HasChildren': True})
res = r.get(link.format(i))
data_json = res.json()
all_data += data_json
# This request will get all the rest of the data at once, other than A-Z or A..-Z..
# This request takes time to load, depending on your network, it got like 3 million+ characters
res = r.get(link.format('.*'))
data_json = res.json()
all_data += data_json
# Sorting the data depending on the TreeNumber
all_data.sort(key=operator.itemgetter('TreeNumber'))
# Printing the tree using tabulations
for row in all_data:
l = len(row['TreeNumber'])
if l == 3:
print('\t', end='')
elif l > 3:
print('\t'*(len(row['TreeNumber'].split('.'))+1), end='')
print(row['RecordName'])

How to handle blank line,junk line and \n while converting an input file to csv file

Below is the sample data in input file. I need to process this file and turn it into a csv file. With some help, I was able to convert it to csv file. However not fully converted to csv since I am not able to handle \n, junk line(2nd line) and blank line(4th line). Also, i need help to filter transaction_type i.e., avoid "rewrite" transaction_type
{"transaction_type": "new", "policynum": 4994949}
44uu094u4
{"transaction_type": "renewal", "policynum": 3848848,"reason": "Impressed with \n the Service"}
{"transaction_type": "cancel", "policynum": 49494949, "cancel_table":[{"cancel_cd": "AU"}, {"cancel_cd": "AA"}]}
{"transaction_type": "rewrite", "policynum": 5634549}
Below is the code
import ast
import csv
with open('test_policy', 'r') as in_f, open('test_policy.csv', 'w') as out_f:
data = in_f.readlines()
writer = csv.DictWriter(
out_f,
fieldnames=[
'transaction_type', 'policynum', 'cancel_cd','reason'],lineterminator='\n',
extrasaction='ignore')
writer.writeheader()
for row in data:
dict_row = ast.literal_eval(row)
if 'cancel_table' in dict_row:
cancel_table = dict_row['cancel_table']
cancel_cd= []
for cancel_row in cancel_table:
cancel_cd.append(cancel_row['cancel_cd'])
dict_row['cancel_cd'] = ','.join(cancel_cd)
writer.writerow(dict_row)
Below is my output not considering the junk line,blank line and transaction type "rewrite".
transaction_type,policynum,cancel_cd,reason
new,4994949,,
renewal,3848848,,"Impressed with
the Service"
cancel,49494949,"AU,AA",
Expected output
transaction_type,policynum,cancel_cd,reason
new,4994949,,
renewal,3848848,,"Impressed with the Service"
cancel,49494949,"AU,AA",
Hmm I try to fix them but I do not know how CSV file work, but my small knoll age will suggest you to run this code before to convert the file.
txt = {"transaction_type": "renewal",
"policynum": 3848848,
"reason": "Impressed with \n the Service"}
newTxt = {}
for i,j in txt.items():
# local var (temporar)
lastX = ""
correctJ = ""
# check if in J is ascii white space "\n" and get it out
if "\n" in f"b'{j}'":
j = j.replace("\n", "")
# for grammar purpose check if
# J have at least one space
if " " in str(j):
# if yes check it closer (one by one)
for x in ([j[y:y+1] for y in range(0, len(j), 1)]):
# if 2 spaces are consecutive pass the last one
if x == " " and lastX == " ":
pass
# if not update correctJ with new values
else:
correctJ += x
# remember what was the last value checked
lastX = x
# at the end make J to be the correctJ (just in case J has not grammar errors)
j = correctJ
# add the corrections to a new dictionary
newTxt[i]=j
# show the resoult
print(f"txt = {txt}\nnewTxt = {newTxt}")
Termina:
txt = {'transaction_type': 'renewal', 'policynum': 3848848, 'reason': 'Impressed with \n the Service'}
newTxt = {'transaction_type': 'renewal', 'policynum': 3848848, 'reason': 'Impressed with the Service'}
Process finished with exit code 0

Python add ', ' to string and return:

fd = open(nom_fichier, 'r')
liste_chaine = fd.readlines()
liste_chaine2 = []
for item in liste_chaine:
if item not in "'noir\n','blanc\n','Humain\n', 'Ordinateur\n', 'False\n', 'True\n":
liste_chaine2.append(item)
liste_chaine2 = [i.replace('\n', '') for i in liste_chaine2]
return liste_chaine2
['3,3,blanc', '3,4,noir', '4,3,noir', '4,4,blanc']
i am reading a file and trying to return a string output exactly like :
3,3,blanc
4,3,noir
3,4,white
i cleaned the file with the code above but need to clean up this list to the required output
You can split your string and put it together again to meet your requirements:
string = '33blanc 34noir 43noir 44blanche'
result = '\n'.join(['{},{},{}'.format(v[0], v[1], v[2:]) for v in string.split()])
print(result)
3,3,blanc
3,4,noir
4,3,noir
4,4,blanche

Unknown column added in user input form

I have a simple data entry form that writes the inputs to a csv file. Everything seems to be working ok, except that there are extra columns being added to the file in the process somewhere, seems to be during the user input phase. Here is the code:
import pandas as pd
#adds all spreadsheets into one list
Batteries= ["MAT0001.csv","MAT0002.csv", "MAT0003.csv", "MAT0004.csv",
"MAT0005.csv", "MAT0006.csv", "MAT0007.csv", "MAT0008.csv"]
#User selects battery to log
choice = (int(input("Which battery? (1-8):")))
def choosebattery(c):
done = False
while not done:
if(c in range(1,9)):
return Batteries[c]
done = True
else:
print('Sorry, selection must be between 1-8')
cfile = choosebattery(choice)
cbat = pd.read_csv(cfile)
#Collect Cycle input
print ("Enter Current Cycle")
response = None
while response not in {"Y", "N", "y", "n"}:
response = input("Please enter Y or N: ")
cy = response
#Charger input
print ("Enter Current Charger")
response = None
while response not in {"SC-G", "QS", "Bosca", "off", "other"}:
response = input("Please enter one: 'SC-G', 'QS', 'Bosca', 'off', 'other'")
if response == "other":
explain = input("Please explain")
ch = response + ":" + explain
else:
ch = response
#Location
print ("Enter Current Location")
response = None
while response not in {"Rack 1", "Rack 2", "Rack 3", "Rack 4", "EV001", "EV002", "EV003", "EV004", "Floor", "other"}:
response = input("Please enter one: 'Rack 1 - 4', 'EV001 - 004', 'Floor' or 'other'")
if response == "other":
explain = input("Please explain")
lo = response + ":" + explain
else:
lo = response
#Voltage
done = False
while not done:
choice = (float(input("Enter Current Voltage:")))
modchoice = choice * 10
if(modchoice in range(500,700)):
vo = choice
done = True
else:
print('Sorry, selection must be between 50 and 70')
#add inputs to current battery dataframe
log = pd.DataFrame([[cy,ch,lo,vo]],columns=["Cycle", "Charger", "Location", "Voltage"])
clog = pd.concat([cbat,log], axis=0)
clog.to_csv(cfile, index = False)
pd.read_csv(cfile)
And I receive:
Out[18]:
Charger Cycle Location Unnamed: 0 Voltage
0 off n Floor NaN 50.0
Where is the "Unnamed" column coming from?
There's an 'unnamed' column coming from your csv. The reason most likely is that the lines in your input csv files end with a comma (i.e. your separator), so pandas interprets that as an additional (nameless) column. If that's the case, check whether your lines end with your separator. For example, if your files are separated by commas:
Column1,Column2,Column3,
val_11, val12, val12,
...
Into:
Column1,Column2,Column3
val_11, val12, val12
...
Alternatively, try specifying the index column explicitly as in this answer. I believe some of the confusion stems from pandas concat reordering your columns .

Last page not showing in scrapy

So my code (pasted) below almost does what I want. Instead, it covers 29/30 pages, and then leaves out the last. Furthermore, I would preferably have it go beyond, but the website has no button for it (the pages actually do work when you manually fill in page=31 in the link). When Depth_Limit is 29 it's all fine, but on 30 I get the following error in the command prompt:
File "C:\Users\Ewald\Scrapy\OB\OB\spiders\spider_OB.py", line 23, in parse
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[#class="volgende"]/#href').extract()[0]
IndexError: list index out of range
I've tried various approaches, but they all seem to fail me...
class OB_Crawler(CrawlSpider):
name = 'OB5'
allowed_domains = ["https://www.officielebekendmakingen.nl/"]
start_urls = ["https://zoek.officielebekendmakingen.nl/zoeken/resultaat/?zkt=Uitgebreid&pst=Tractatenblad|Staatsblad|Staatscourant|BladGemeenschappelijkeRegeling|ParlementaireDocumenten&vrt=Cybersecurity&zkd=InDeGeheleText&dpr=Alle&sdt=DatumPublicatie&ap=&pnr=18&rpp=10&_page=1&sorttype=1&sortorder=4"]
custom_settings = {
'BOT_NAME': 'OB-crawler',
'DEPTH_LIMIT': 30,
'DOWNLOAD_DELAY': 0.1
}
def parse(self, response):
s = Selector(response)
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[#class="volgende"]/#href').extract()[0]
if len(next_link):
yield self.make_requests_from_url(next_link)
posts = response.selector.xpath('//div[#class = "lijst"]/ul/li')
for post in posts:
i = TextPostItem()
i['title'] = ' '.join(post.xpath('a/#href').extract()).replace(';', '').replace(' ', '').replace('\r\n', '')
i['link'] = ' '.join(post.xpath('a/text()').extract()).replace(';', '').replace(' ', '').replace('\r\n', '')
i['info'] = ' '.join(post.xpath('a/em/text()').extract()).replace(';', '').replace(' ', '').replace('\r\n', '').replace(',', '-')
yield i
The index out of range error is the result of an incorrect xpath (you end up calling for the first item of an empty list).
change your "next_link = ... " to
next_link = 'https://zoek.officielebekendmakingen.nl/' + s.xpath('//a[contains(#class, "volgende")]/#href').extract()[0]
You need to use contains, which runs a predicate search.. filters for what you want

Resources