I have this code that outputs coordinates for a port:
import urllib
import urllib.request as request
import re
# Look up a port page on ports.com and print the raw text that follows the
# "Coordinates:" label. Both answers must be the URL slugs the site uses,
# e.g. "greece" and "port-of-eleusis".
a = input("What country is your port in?: ")
b = input("What is the name of the port?: ")
url = "http://ports.com/"
# Sample slugs; not used by the lookup below.
country = ["united-kingdom","greece"]
ports = ["port-of-eleusis","portsmouth-continental-ferry-port","poole-harbour"]
totalurl = url + a + "/" + b + "/"
# Non-greedy group: capture only up to the first closing </span>.
regex = r'<strong>Coordinates:</strong>(.*?)</span>'
pattern = re.compile(regex)
# Fetch the page once; the context manager closes the connection afterwards.
with urllib.request.urlopen(totalurl) as response:
    htmltext = response.read()
html = htmltext.decode()
num = pattern.findall(html)
print(num)
The output is correct, but I need the coordinates in a format like 39°09'24.6''N 175°37'55.8''W instead of:
>>> [' 50&deg;48&prime;41.04&Prime;N 1&deg;5&prime;31.31&Prime;W']
This happens because HTML uses entity codes to represent specific Unicode characters, and Python does not decode them for you. To fix this, replace print(num) with print(list(i.replace('&deg;', "°").replace('&prime;',"′").replace('&Prime;',"″") for i in num))
This essentially replaces &deg; with °, &prime; with ′, and &Prime; with ″.
>>> print(list(i.replace('&deg;', "°").replace('&prime;',"′").replace('&Prime;',"″") for i in num))
[" 50°48′41.04″N 1°5′31.31″W"]
>>>
Related
I want to replace these symbols with '-' and I know there should be a better way than doing this:
if '/' in var1:
var1= var1.replace('/', '-')
if '#' in var1:
var1= var1.replace('#', '-')
if ';' in var1:
var1 = var1.replace(';', '-')
if ':' in var1:
var1= var1.replace(':', '-')
This is what I tried, which is clearly wrong and I'm not able to properly optimize it.
str = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
print([str.replace(i,'-') for i in str])
replaceAll doesn't work, gives me an error saying str does not has that attribute.
str.replaceAll("[<>]", "")
How about using str.translate()?
# Build one lookup table that sends each of "#:;/" to a hyphen.
hyphenator = str.maketrans(dict.fromkeys("#:;/", "-"))
# A single translate() pass then swaps every listed character.
sample = "Testing PRI/Sec (#434242332;PP:432:133423846,335)"
print(sample.translate(hyphenator))
Or, even faster, use a compiled regex:
# Alternation of the escaped characters, compiled once for reuse.
compiled_re = re.compile("|".join(map(re.escape, "#:;/")))
text = "Testing PRI/Sec (#434242332;PP:432:133423846,335)"
print(compiled_re.sub("-", text))
Both of these methods are much faster than the other methods proposed (at least on that input):
import re
import timeit
s = "Testing PRI/Sec (#434242332;PP:432:133423846,335)"
a = ["#", ":", ";", "/"]
hyphenator = str.maketrans({c: "-" for c in "#:;/"})
# compiled_re_sub() needs this pattern; define it here so the benchmark
# runs on its own instead of failing with a NameError.
compiled_re = re.compile("|".join(re.escape(i) for i in a))


def str_translate():
    """Replace via the prebuilt str.translate table."""
    s.translate(hyphenator)


def join_generator():
    """Replace by joining a per-character generator expression."""
    "".join("-" if ch in a else ch for ch in s)


def append_in_loop():
    """Replace by appending to a string one character at a time."""
    temp = ""
    for i in s:
        if i in a:
            temp += "-"
        else:
            temp += i


def re_sub():
    """Replace with re.sub, rebuilding the pattern string on every call."""
    re.sub("|".join(re.escape(i) for i in a), "-", s)


def compiled_re_sub():
    """Replace with the pattern compiled once at module level."""
    compiled_re.sub("-", s)


# Only time the methods when run as a script, so importing this module
# does not trigger five million-iteration benchmarks.
if __name__ == "__main__":
    for method in [str_translate, join_generator, re_sub, append_in_loop, compiled_re_sub]:
        # run a million iterations and report the total time
        print("{} took a total of {}s".format(method.__name__, timeit.timeit(method)))
Results on my machine:
str_translate took a total of 1.1160085709998384s
join_generator took a total of 4.599312704987824s
re_sub took a total of 4.101858579088002s
append_in_loop took a total of 4.257988628000021s
compiled_re_sub took a total of 1.0353244650177658s
s = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
# Keep each character unless it is listed in `a`; listed ones become '-'.
pieces = [ch if ch not in a else '-' for ch in s]
print(''.join(pieces))
Prints:
Testing PRI-Sec (-434242332-PP-432-133423846,335)
Or using re:
import re

s = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
# OR together the escaped characters, then replace every match in one pass.
alternation = '|'.join(map(re.escape, a))
print(re.sub(alternation, '-', s))
Prints:
Testing PRI-Sec (-434242332-PP-432-133423846,335)
Use re package
import re
string = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
# The character class matches any single one of # : ; /
symbol_class = re.compile('[#:;/]')
result = symbol_class.sub("-", string)
print(result)
Result:
Testing PRI-Sec (-434242332-PP-432-133423846,335)
Just loop through the string and add each character to the temp variable, unless it is in the list "a"; in that case, add "-" to the variable instead.
# Named `text` rather than `str` so the builtin str() stays usable.
text = 'Testing PRI/Sec (#434242332;PP:432:133423846,335)'
a = ['#',':',';','/']
temp = ''
for i in text:
    if i in a:
        # Listed character: emit a hyphen in its place.
        temp = temp + "-"
    else:
        temp = temp + i
print(temp)
I know there are many questions like this, but the answers are all specific and only fix the problem for that person's particular script.
I am currently trying to print a bunch of info from supremenewyork.com
from the UK website. This script can successfully print all the info I want from Supreme US, but when I added the proxy script I started to get a lot of errors.
I know the proxy script works because I tested it on a small script and it was able to pull info that was on Supreme UK and didn't exist on Supreme US.
Here is my script.
import requests
from bs4 import BeautifulSoup
# Walk every category page of supremenewyork.com through the user-supplied
# proxies and print each item's image alt text, name and style.
# Python 2 syntax throughout (raw_input, print statements).
UK_Proxy1 = raw_input('UK http Proxy1: ')
UK_Proxy2 = raw_input('UK http Proxy2: ')
# Both schemes are routed to an http:// proxy address.
proxies = {
'http': 'http://' + UK_Proxy1 + '',
'https': 'http://' + UK_Proxy2 + '',
}
categorys = ['jackets','shirts','tops_sweaters','sweatshirts','pants','shorts','t- shirts','hats','hats','bags','accessories','shoes','skate']
# catNumb tracks the loop position by hand; the loop variable `cat` is unused.
catNumb = 0
altArray = []
nameArray = []
styleArray = []
for cat in categorys:
catStr = str(categorys[catNumb])
cUrl = 'http://www.supremenewyork.com/shop/all/' + catStr
# NOTE: cUrl is a plain str, so cUrl.text raises
# AttributeError: 'str' object has no attribute 'text'.
# .text belongs on the Response returned by requests.get().
proxy_script = requests.get((cUrl.text), proxies=proxies)
bSoup = BeautifulSoup(proxy_script, 'lxml')
print('\n*******************"'+ catStr.upper() + '"*******************\n')
catNumb += 1
for item in bSoup.find_all('div', class_='inner-article'):
url = item.a['href']
alt = item.find('img')['alt']
# This item-page request is made without proxies=.
req = requests.get('http://www.supremenewyork.com' + url)
item_soup = BeautifulSoup(req.text, 'lxml')
name = item_soup.find('h1', itemprop='name').text
style = item_soup.find('p', itemprop='model').text
print alt +(' --- ')+ name +(' --- ')+ style
altArray.append(alt)
nameArray.append(name)
styleArray.append(style)
print altArray
print nameArray
print styleArray
I am getting this error when I execute the script
AttributeError: 'str' object has no attribute 'text' with the error pointing towards the
proxy_script = requests.get((cUrl.text), proxies=proxies)
I recently added this to the script, which sort of fixed it: it was able to print the categories, but no info between them, which is what I need. It just printed ****************jackets**************, ****shirts******, etc. Here is what I changed:
import requests
from bs4 import BeautifulSoup
# Walk every category page of supremenewyork.com through the user-supplied
# proxies and print each item's image alt text, name and style.
# make sure proxy is http and port 8080
UK_Proxy1 = raw_input('UK http Proxy1: ')
UK_Proxy2 = raw_input('UK http Proxy2: ')
# Both schemes are routed to an http:// proxy address.
proxies = {
    'http': 'http://' + UK_Proxy1,
    'https': 'http://' + UK_Proxy2,
}
BASE_URL = 'http://www.supremenewyork.com'
categorys = ['jackets','shirts','tops_sweaters','sweatshirts','pants','shorts','t-shirts','hats','bags','accessories','shoes','skate']
altArray = []
nameArray = []
styleArray = []
# Iterate the category names directly instead of keeping a manual index.
for catStr in categorys:
    cUrl = BASE_URL + '/shop/all/' + catStr
    # .text turns the Response into the HTML string BeautifulSoup expects.
    proxy_script = requests.get(cUrl, proxies=proxies).text
    bSoup = BeautifulSoup(proxy_script, 'lxml')
    print('\n*******************"'+ catStr.upper() + '"*******************\n')
    for item in bSoup.find_all('div', class_='inner-article'):
        url = item.a['href']
        alt = item.find('img')['alt']
        # Fetch the item page through the same proxies as the category page;
        # the original request bypassed them.
        req = requests.get(BASE_URL + url, proxies=proxies)
        item_soup = BeautifulSoup(req.text, 'lxml')
        name = item_soup.find('h1', itemprop='name').text
        style = item_soup.find('p', itemprop='model').text
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(alt + ' --- ' + name + ' --- ' + style)
        altArray.append(alt)
        nameArray.append(name)
        styleArray.append(style)
print(altArray)
print(nameArray)
print(styleArray)
I put .text at the end and it worked sorta.... How do i fix it so it prints the info I want???
I think you missed something. Your cUrl is a str, not a Response object. I guess you want:
proxy_script = requests.get(cUrl, proxies=proxies).text
I am trying to learn how to automatically fetch urls from a page. In the following code I am trying to get the port co-ordinates of the webpage from different links:
import urllib.request
import re
# Ask for a country and port slug, then try to pull a <span> out of the page.
a = input("What country is your port in?: ")
b = input("What is the name of the port?: ")
url = "http://ports.com/"
# totalurl is built but never used; both requests below fetch `url`.
totalurl = "http://ports.com/" + a + "/" + b + "/"
# NameError here: `import urllib.request` binds only the name `urllib`,
# so the bare name `request` does not exist.
htmlfile = request.urlopen(url)
# htmltext is read but not used afterwards.
htmltext = htmlfile.read()
# The adjacent literals "/" and '">...</span>' are concatenated implicitly;
# in a regex, `...` matches exactly three arbitrary characters.
regex = '<span class="small'+ a + "/" + b + "/" '">...</span>'
pattern = re.compile(regex)
with urllib.request.urlopen(url) as response:
html = response.read().decode()
num = re.findall(pattern, html)
print(num)
This is the error message I receive:
What country is your port in?: greece
What is the name of the port?: port-of-eleusis
Traceback (most recent call last):
File "/Users/kieronblemings/Desktop/PROGRAMS PYTHON/ports extraction.py", line 13, in <module>
htmlfile = request.urlopen(url)
NameError: name 'request' is not defined
I'm trying to extract information from a very large XML file by parsing it with a python(v2.7.10) script. The goal is to make this information available to my other project written in C as (very large) array and integer literals in a file.
The following code demonstrates what I'm doing:
import sys
myList = [[1,2,3],[11,22,33]]
size = len(myList)
# Assemble the C-style declaration piece by piece; str(myList) still yields
# Python's bracketed list repr.
parts = ["int myList[", str(size), "][3] = ", str(myList), ";\n"]
print("".join(parts))
The result of this is int myList[2][3] = [[1,2,3],[11,22,33]];, but what I need is C syntax: int myList[2][3] = {{1,2,3},{11,22,33}};
Is there a way to modify str() in a way that it uses {} instead of [] for printing lists?
You can use a translate table trans for that, and then use the str.translate(..) function:
from string import maketrans
# Character-for-character table: '[' -> '{' and ']' -> '}'.
# NOTE: string.maketrans is a Python 2 API; on Python 3 the equivalent is
# str.maketrans('[]', '{}').
trans = maketrans('[]','{}')
# `size` and `myList` are the variables defined in the question's snippet.
print("int myList[" + str(size) + "][3] = " + str(myList).translate(trans) + ";\n")
This then prints:
>>> from string import maketrans
>>> myList = [[1,2,3],[11,22,33]]
>>> size = len(myList)
>>> trans = maketrans('[]','{}')
>>> print("int myList[" + str(size) + "][3] = " + str(myList).translate(trans) + ";\n")
int myList[2][3] = {{1, 2, 3}, {11, 22, 33}};
But note that if the elements are not lists, this can result in content that is not semantically valid. We simply replace '[' by '{', and ']' by '}'.
I need to remove the [',\n\xa0 characters and the years (1994) from this entry in the list then iterate over the list doing this to each entry.
is there a way I can do this? I'm newish to python and have been trying for hours
The entries are like so :
[['The Shawshank Redemption\n(1994)\n\n\n 9.2\xa0\xa0\n\n'], ['The Godfather\n(1972)\n\n\n 9.2\xa0\xa0\n\n'], ['The Godfather: Part II\n(1974)\n\n\n 9.0\xa0\xa0\n\n'],
Edit: sorry for not including the code. I've managed to strip the numbers and the \n newline characters after the year, but I'm still getting a newline character just after the film title. I'll paste my code anyway, thanks!:
from bs4 import BeautifulSoup
import requests
import random
# Download the IMDb mobile top chart, trim each entry and print one
# randomly chosen entry.
names = []
newList = []
url = 'http://m.imdb.com/chart/top'
# get contents from url
content = requests.get(url).content
# get soup
soup = BeautifulSoup(content,'lxml') # choose lxml parser
# find all the references
ref_tags = soup.findAll('span', { 'class' : 'media-body' })
# realTags is collected but not used below.
realTags = soup.find_all("h4")
# iterate through the ResultSet
for i,ref_tag in enumerate(ref_tags):
# print text only
names.append('[{0}] {1}'.format(i,ref_tag.text))
# pos mirrors the loop index by hand; the loop variable `name` is unused.
pos = 0
for name in names:
newName = names[pos]
# NOTE(review): the fixed offsets 9 and 100 and the slice/del steps below
# look tuned to the page's text layout at the time - confirm before reuse.
newName = newName[9:]
newName = newName[:100]
# Split at '(' so the text before the first '(' becomes its own list element.
newName = newName.split("(")
# Keep every second piece, starting with the first.
newName = newName[::2]
del newName[2:9:3]
newList.append(newName)
pos = pos + 1
print(newList)
# Each element of newList is itself a list, so `choice` is a list.
choice = random.choice(newList)
print(choice)
the output is like this:
[['The Shawshank Redemption\n'], ['The Godfather\n'], ['The Godfather: Part II\n'], ['The Dark Knight\n'], ['12 Angry Men\n']
So I got it to output how I would like. Thanks anyway!
Here's the code for anyone who may need it in the future:
from bs4 import BeautifulSoup
import requests
import random
# Download the IMDb mobile top chart, trim each entry and print the title
# of one randomly chosen entry.
names = []
newList = []
url = 'http://m.imdb.com/chart/top'
# get contents from url
content = requests.get(url).content
# get soup
soup = BeautifulSoup(content,'lxml') # choose lxml parser
# find all the references
ref_tags = soup.findAll('span', { 'class' : 'media-body' })
# realTags is collected but not used below.
realTags = soup.find_all("h4")
# iterate through the ResultSet
for i,ref_tag in enumerate(ref_tags):
# print text only
names.append('[{0}] {1}'.format(i,ref_tag.text))
# pos mirrors the loop index by hand; the loop variable `name` is unused.
pos = 0
for name in names:
newName = names[pos]
# NOTE(review): the fixed offsets 9 and 100 and the slice/del steps below
# look tuned to the page's text layout at the time - confirm before reuse.
newName = newName[9:]
newName = newName[:100]
# Split at '(' so the text before the first '(' becomes its own list element.
newName = newName.split("(")
# Keep every second piece, starting with the first.
newName = newName[::2]
del newName[2:9:3]
newList.append(newName)
pos = pos + 1
# Each element of newList is itself a list, so wordChoice is a list.
wordChoice = random.choice(newList)
# NOTE: rebinding `str` hides the builtin from here on; any later str(...)
# call in this module would fail because `str` is now a string.
str = str(wordChoice)
# "\\n" is the two-character sequence backslash + n, which is how newlines
# appear in the list's textual representation.
editWord = str.split("\\n")
print(editWord[1])
and the output is like so:
Shutter Island