python tor stem HTTP error 503 - python-3.x

I'm currently trying to get a new ip via python.
show source :
import random
import socket
import time
import urllib.parse
import urllib.request

import socks
from stem import Signal
from stem.control import Controller
# Route plain-HTTP requests through Privoxy on 8118 (which forwards to the
# Tor SOCKS port, per the privoxy config quoted below).
proxy_support = urllib.request.ProxyHandler({"http" : "127.0.0.1:8118"})
opener = urllib.request.build_opener(proxy_support)
# Pool of real desktop-browser User-Agent strings; one is chosen at random
# per request so the traffic looks less uniform to the target site.
UA = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36'
]
def newI():
    """Ask the local Tor control port (9051) for a fresh circuit (NEWNYM)
    and print the traffic counters Tor reports."""
    # Controller supports the context-manager protocol, which guarantees the
    # control connection is closed even if authentication fails.
    with Controller.from_port(port=9051) as ctrl:
        ctrl.authenticate()
        ctrl.signal(Signal.NEWNYM)
        print(ctrl.get_info("traffic/read"))
        print(ctrl.get_info("traffic/written"))
if __name__ == '__main__':
    params = 'site:google.com admin'
    page = 0
    # Install the proxied opener ONCE before the loop.  The original rebuilt
    # a ProxyHandler on every iteration and never used it (dead code), and
    # re-installed the same opener 100 times.
    urllib.request.install_opener(opener)
    for _ in range(100):
        url = 'http://www.google.co.kr/search?hl=ko&q=%s&start=%d' % (
            urllib.parse.quote(params), page)
        # Random UA + random 1-4 s delay to look less like an automated bot.
        headers = {'User-Agent': random.choice(UA)}
        time.sleep(random.randrange(1, 5, 1))
        req = urllib.request.Request(url, headers=headers)
        res = urllib.request.urlopen(req)
        html = res.read()
        print(len(html))
        page += 10
        newI()  # request a new Tor circuit/IP before the next query
I have my vidalia running and privoxy. I have my settings correctly set:
Web Proxy (HTTP): 127.0.0.1:8118 and the same for HTTPS
In my privoxy config file I have this line:
forward-socks5 / 127.0.0.1:9050 .
Though still, when I run the code it is stuck on case 1 and I can't get an IP. This is the log of my Vidalia:
1. settings > Sharing > Run as client only
2. settings > Advanced > 127.0.0.1 : 9051
Though still, when I run the code it is stuck on case 1 and I can't get an IP. This is the log of my Vidalia:
Traceback (most recent call last):
File "C:/Users/kwon/PycharmProjects/google_search/test.py", line 50, in <module>
res = urllib.request.urlopen(req)
File "C:\Python33\lib\urllib\request.py", line 156, in urlopen
return opener.open(url, data, timeout)
File "C:\Python33\lib\urllib\request.py", line 475, in open
response = meth(req, response)
File "C:\Python33\lib\urllib\request.py", line 587, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python33\lib\urllib\request.py", line 507, in error
result = self._call_chain(*args)
File "C:\Python33\lib\urllib\request.py", line 447, in _call_chain
result = func(*args)
File "C:\Python33\lib\urllib\request.py", line 692, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python33\lib\urllib\request.py", line 475, in open
response = meth(req, response)
File "C:\Python33\lib\urllib\request.py", line 587, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python33\lib\urllib\request.py", line 513, in error
return self._call_chain(*args)
File "C:\Python33\lib\urllib\request.py", line 447, in _call_chain
result = func(*args)
File "C:\Python33\lib\urllib\request.py", line 595, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Unavailable
What am I doing wrong ?

Google prevents automated requests; take a look at this other post: Tor blocked by Google

Related

Python requests code worked yesterday but now returns TooManyRedirects: Exceeded 30 redirects

I am trying to get the data from a site using requests using this simple code (running on Google Colab):
import json

import requests


def GetAllStocks():
    """Download and JSON-decode the SSI iboard "all stocks" listing."""
    endpoint = 'https://iboard.ssi.com.vn/dchart/api/1.1/defaultAllStocks'
    response = requests.get(endpoint)
    return json.loads(response.text)
This worked well until this morning and I could not figure out why it is returning "TooManyRedirects: Exceeded 30 redirects." error now.
I can still get the data just by browsing the url directly from Google Chrome in Incognito mode, so I do not think this is because of the cookies. I tried passing the whole headers but it still does not work. I tried passing 'allow_redirects=False' and the returned status_code is 302.
I am not sure if there is anything I could try as this is so strange to me.
Any guidance is much appreciated. Thank you very much!
You need to send a user-agent header to mimic regular browser behaviour.
import json
import random

import requests


def GetAllStocks():
    """Fetch the SSI iboard stock list while spoofing a browser User-Agent."""
    # Rotate through a handful of real browser UA strings so the request does
    # not advertise itself as the default python-requests client.
    browser_uas = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:101.0) Gecko/20100101 Firefox/101.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:77.0) Gecko/20100101 Firefox/77.0",
    ]
    request_headers = {
        "User-Agent": random.choice(browser_uas),
        "Accept": "application/json",
    }
    resp = requests.get(
        "https://iboard.ssi.com.vn/dchart/api/1.1/defaultAllStocks",
        headers=request_headers,
    )
    return json.loads(resp.text)


data = GetAllStocks()
print(data)

Using WebSocket for Web Page Data Scraping

I want to scrape some of the data from here which is implemented based on websockets. So after inspecting the Chrome DevTools for wss address and header:
and the negotiation message:
I wrote:
from websocket import create_connection

# Handshake headers copied from Chrome DevTools.  NOTE(review): hard-coding
# Host/Upgrade/Connection/Sec-WebSocket-* here duplicates fields that the
# websocket-client library generates itself -- the request trace below shows
# several of them sent twice, which may be why the server drops the socket.
headers = {
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,fa;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'Upgrade',
    'Host': 'stream179.forexpros.com',
    'Origin': 'https://www.investing.com',
    'Pragma': 'no-cache',
    'Sec-WebSocket-Extensions': 'client_max_window_bits',
    'Sec-WebSocket-Key': 'ldcvnZNquzPkSNvpSdI09g==',
    'Sec-WebSocket-Version': '13',
    'Upgrade': 'websocket',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
ws = create_connection('wss://stream179.forexpros.com/echo/894/l27e2ja8/websocket', header=headers)
# Bulk-subscribe payload captured from the browser's own websocket session.
nego_message = '''["{\"_event\":\"bulk-subscribe\",\"tzID\":8,\"message\":\"pid-1:%%pid-8839:%%pid-166:%%pid-20:%%pid-169:%%pid-170:%%pid-44336:%%pid-27:%%pid-172:%%pid-2:%%pid-3:%%pid-5:%%pid-7:%%pid-9:%%pid-10:%%pid-945629:%%pid-11:%%pid-16:%%pid-68:%%pidTechSumm-1:%%pidTechSumm-2:%%pidTechSumm-3:%%pidTechSumm-5:%%pidTechSumm-7:%%pidTechSumm-9:%%pidTechSumm-10:%%pidExt-1:%%event-393634:%%event-393633:%%event-393636:%%event-393638:%%event-394479:%%event-394518:%%event-394514:%%event-394516:%%event-394515:%%event-394517:%%event-393654:%%event-394467:%%event-393653:%%event-394468:%%event-394545:%%event-394549:%%event-394548:%%event-394547:%%event-394550:%%event-394546:%%event-394551:%%event-394553:%%event-394552:%%event-394743:%%event-394744:%%event-393661:%%event-394469:%%event-394470:%%event-393680:%%event-393682:%%event-393681:%%event-393687:%%event-393694:%%event-393685:%%event-393689:%%event-393688:%%event-393695:%%event-393698:%%event-393704:%%event-393705:%%event-393724:%%event-393723:%%event-393725:%%event-393726:%%event-394591:%%event-393736:%%event-393733:%%event-393734:%%event-393740:%%event-393731:%%event-393732:%%event-393730:%%event-394617:%%event-394616:%%event-393737:%%event-378304:%%event-393645:%%event-394619:%%event-393755:%%event-393757:%%event-393760:%%event-393756:%%event-393758:%%event-393759:%%event-393761:%%event-393762:%%event-394481:%%event-394625:%%event-393754:%%event-394483:%%event-393775:%%event-394621:%%event-394622:%%event-376710:%%event-394623:%%event-394484:%%event-394624:%%isOpenExch-1:%%isOpenExch-2:%%isOpenExch-13:%%isOpenExch-3:%%isOpenExch-4:%%isOpenPair-1:%%isOpenPair-8839:%%isOpenPair-44336:%%cmt-1-5-1:%%domain-1:\"}"]'''
ws.send(nego_message)
# Read frames forever; raises once the server closes the connection.
while True:
    print(ws.recv())
but I'm getting:
o
Traceback (most recent call last):
File "test.py", line 647, in <module>
print(ws.recv())
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 313, in recv
opcode, data = self.recv_data()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 330, in recv_data
opcode, frame = self.recv_data_frame(control_frame)
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 343, in recv_data_frame
frame = self.recv_frame()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 377, in recv_frame
return self.frame_buffer.recv_frame()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_abnf.py", line 361, in recv_frame
self.recv_header()
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_abnf.py", line 309, in recv_header
header = self.recv_strict(2)
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_abnf.py", line 396, in recv_strict
bytes_ = self.recv(min(16384, shortage))
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_core.py", line 452, in _recv
return recv(self.sock, bufsize)
File "C:\Users\me\AppData\Local\Programs\Python\Python37\lib\site-packages\websocket\_socket.py", line 115, in recv
"Connection is already closed.")
websocket._exceptions.WebSocketConnectionClosedException: Connection is already closed.
[Finished in 1.9s]
What am I missing here?
Update 1: updating code using WebSocketApp:
def on_message(ws, message):
    # Called once per frame received from the server.
    print("message:", message)

def on_error(ws, error):
    print("error:", error)

def on_close(ws):
    print("closed.")

def on_open(ws):
    # Give the handshake a moment to settle, then send the subscribe payload
    # (nego_message is defined earlier in the script).
    print("opened")
    time.sleep(1)
    ws.send(nego_message)

# Long-lived client: run_forever() keeps the socket open and dispatches the
# callbacks above, unlike the one-shot create_connection() attempt.
ws = websocket.WebSocketApp(
    "wss://stream179.forexpros.com/echo/894/l27e2ja8/websocket",
    on_open = on_open,
    on_message = on_message,
    on_error = on_error,
    on_close = on_close,
    header = headers
)
websocket.enableTrace(True)
ws.run_forever()
but still no success:
--- request header ---
GET /echo/894/l27e2ja8/websocket HTTP/1.1
Upgrade: websocket
Connection: Upgrade
Host: stream179.forexpros.com
Origin: http://stream179.forexpros.com
Accept-Encoding: gzip, deflate, br
Accept-Language: en-US,en;q=0.9,fa;q=0.8
Cache-Control: no-cache
Connection: Upgrade
Host: stream179.forexpros.com
Origin: https://www.investing.com
Pragma: no-cache
Sec-WebSocket-Extensions: client_max_window_bits
Sec-WebSocket-Key: ldcvnZNquzPkSNvpSdI09g==
Sec-WebSocket-Version: 13
Upgrade: websocket
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36
-----------------------
--- response header ---
HTTP/1.1 101 Switching Protocols
Upgrade: websocket
Connection: Upgrade
Sec-WebSocket-Accept: XPKKpUMZLpSYx/1z8Q0499hcobs=
-----------------------
opened
send: b'\x81\xfe\x06{_\xda7\xd2\x04\xf8L\xf0\x00\xbfA\xb71\xae\x15\xe8}\xb8B\xbe4\xf7D\xa7=\xa9T\xa06\xb8R\xf0s\xf8C\xa8\x16\x9e\x15\xe8g\xf6\x15\xbf:\xa9D\xb38\xbf\x15\xe8}\xaa^\xb6r\xeb\r\xf7z\xaa^\xb6r\xe2\x0f\xe1f\xe0\x12\xf7/\xb3S\xffn\xec\x01\xe8z\xffG\xbb;\xf7\x05\xe2e\xff\x12\xa26\xbe\x1a\xe3i\xe3\r\xf7z\xaa^\xb6r\xeb\x00\xe2e\xff\x12\xa26\xbe\x1a\xe6k\xe9\x04\xe4e\xff\x12\xa26\xbe\x1a\xe0h\xe0\x12\xf7/\xb3S\xffn\xed\x05\xe8z\xffG\xbb;\xf7\x05\xe8z\xffG\xbb;\xf7\x04\xe8z\xffG\xbb;\xf7\x02\xe8z\xffG\xbb;\xf7\x00\xe8z\xffG\xbb;\xf7\x0e\xe8z\xffG\xbb;\xf7\x06\xe2e\xff\x12\xa26\xbe\x1a\xebk\xef\x01\xe0f\xe0\x12\xf7/\xb3S\xffn\xeb\r\xf7z\xaa^\xb6r\xeb\x01\xe8z\xffG\xbb;\xf7\x01\xeae\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe3e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe0e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe1e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe7e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe5e\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xebe\xff\x12\xa26\xbec\xb7<\xb2d\xa72\xb7\x1a\xe3o\xe0\x12\xf7/\xb3S\x97\'\xae\x1a\xe3e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe9\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4l\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xe1i\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xec\x04\xeae\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6k\xed\x0e\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7n\xe2\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe3k\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xef\x06\xe4e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xeb\x02\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7n\xed\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xe7k\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xee\x01\xe5e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xef\x04\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe6i\xe2\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe6j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xef\x03\xebe\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xee\x0f\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe7k\xed\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe7o\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xef\x03\xe4e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xef\x06\xe8z\xffR\xa
4:\xb4C\xffl\xe3\x03\xe7j\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x02\xe7m\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xed\x03\xe1e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6h\xee\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4i\xeb\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x03\xe4f\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xee\x00\xe2e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe2\x07\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4g\xe8\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xean\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xec\x0f\xe5e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe3\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4g\xef\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x01\xeaf\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xec\x0f\xeae\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xe3\x02\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe4f\xe2\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe2k\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x07\xe7e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xe8\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5m\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe0j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x05\xe4e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6j\xe3\x06\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5l\xec\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe1l\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x04\xe6e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xee\x07\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5l\xeb\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe1m\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x04\xe2e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6i\xeb\x00\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe4n\xec\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe1h\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xe5g\xe9\x07\xe6e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1i\xee\x02\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe4n\xe3\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe7j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x02\xe5e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xec\x07\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5j\xec\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe7g\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebl\xed\x02\xebe\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xec\x06\xe8z\xffR\xa4:\xb4C\xffl\xe3\x04\xe5i\xe8\r\xf7z\xbfA\
xb71\xae\x1a\xe1f\xee\x03\xean\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xec\x05\xe7e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe1h\xef\x03\xe8z\xffR\xa4:\xb4C\xffl\xe3\x03\xe6g\xe9\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xe9\x00\xe5j\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xec\x05\xe3e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6i\xe8\x05\xe8z\xffR\xa4:\xb4C\xffl\xed\x01\xe5n\xea\r\xf7z\xbfA\xb71\xae\x1a\xe1f\xee\x01\xe0l\xe0\x12\xf7:\xacR\xbc+\xf7\x04\xebk\xee\x0f\xe6e\xff\x12\xb7)\xbfY\xa6r\xe9\x0e\xe6i\xe8\x03\xe8z\xff^\xa1\x10\xaaR\xbc\x1a\xa2T\xbar\xeb\r\xf7z\xb3D\x9d/\xbfY\x97\'\xb9_\xffm\xe0\x12\xf76\xa9x\xa2:\xb4r\xaa<\xb2\x1a\xe3l\xe0\x12\xf76\xa9x\xa2:\xb4r\xaa<\xb2\x1a\xe1e\xff\x12\xbb,\x95G\xb71\x9fO\xb17\xf7\x03\xe8z\xff^\xa1\x10\xaaR\xbc\x0f\xbb^\xa0r\xeb\r\xf7z\xb3D\x9d/\xbfY\x82>\xb3E\xffg\xe2\x04\xebe\xff\x12\xbb,\x95G\xb71\x8aV\xbb-\xf7\x03\xe6l\xe9\x01\xe8z\xffT\xbf+\xf7\x06\xffj\xf7\x06\xe8z\xffS\xbd2\xbb^\xbcr\xeb\r\xf0"\xf8j'
message: o
send: b'\x88\x82!\xdd\x07\xcf"5'
closed.
[Finished in 2.3s]
I tried to remove all the dashes from the message sent and eventually it worked.
# Working payload: a plain JSON object (no SockJS '["..."]' wrapping and no
# backslash-escaped quotes), sent as-is over the socket.
nego_message = '{"_event":"bulk-subscribe","tzID":8,"message":"pid-0:%%isOpenExch-1:%%pid-8849:%%isOpenExch-1004:%%pid-8833:%%pid-8862:%%pid-8830:%%pid-8836:%%pid-8831:%%pid-8916:%%pid-8832:%%pid-169:%%pid-20:%%isOpenExch-2:%%pid-166:%%pid-172:%%isOpenExch-4:%%pid-27:%%isOpenExch-3:%%pid-167:%%isOpenExch-9:%%pid-178:%%isOpenExch-20:%%pid-6408:%%pid-6369:%%pid-13994:%%pid-6435:%%pid-13063:%%pid-26490:%%pid-243:%%pid-1:%%isOpenExch-1002:%%pid-2:%%pid-3:%%pid-5:%%pid-7:%%pid-9:%%pid-10:%%pid-23705:%%pid-23706:%%pid-23703:%%pid-23698:%%pid-8880:%%isOpenExch-118:%%pid-8895:%%pid-1141794:%%pid-1175152:%%isOpenExch-152:%%pid-1175153:%%pid-14958:%%pid-44336:%%isOpenExch-97:%%pid-8827:%%pid-6497:%%pid-941155:%%pid-104395:%%pid-1013048:%%pid-1055979:%%pid-1177973:%%pid-1142416:%%pidExt-1:%%cmt-1-5-1:%%pid-252:%%pid-1031244:%%isOpenExch-125:"}'
ws.send(nego_message)
# Print every incoming frame until the server closes the connection.
while True:
    print(ws.recv())
Outputs:
a["{\"message\":\"pid-3::{\\\"pid\\\":\\\"3\\\",\\\"last_dir\\\":\\\"greenBg\\\",\\\"last_numeric\\\":149.19,\\\"last\\\":\\\"149.19\\\",\\\"bid\\\":\\\"149.18\\\",\\\"ask\\\":\\\"149.19\\\",\\\"high\\\":\\\"149.29\\\",\\\"low\\\":\\\"149.12\\\",\\\"last_close\\\":\\\"149.26\\\",\\\"pc\\\":\\\"-0.07\\\",\\\"pcp\\\":\\\"-0.05%\\\",\\\"pc_col\\\":\\\"redFont\\\",\\\"turnover\\\":\\\"18.13K\\\",\\\"turnover_numeric\\\":\\\"18126\\\",\\\"time\\\":\\\"0:39:09\\\",\\\"timestamp\\\":1666139948}\"}"]
The while loop is calling ws.recv() twice. If you simply do:
print(ws.recv())
It will not attempt to call .recv() on a closed connection. The result of your message output is printing o before the stack trace.
As an aside, it seems like you might want a longer running connection using websocket.WebSocketApp (example) for a scrape.

Expand short urls in python using requests library

I have a large number of short URLs and I want to expand them. I found somewhere online (I missed the source) the following code:
short_url = "t.co/NHBbLlfCaa"
# requests follows redirects automatically, so after the call returns,
# .url holds the final (expanded) address rather than the short one.
response = requests.get(short_url)
if response.status_code == 200:
    print("Actual url:%s" % response.url)
It works perfectly. But I get this error when I ping the same server many times:
urllib3.exceptions.MaxRetryError:
HTTPConnectionPool(host='www.fatlossadvice.pw', port=80): Max retries
exceeded with url:
/TIPS/KILLED-THAT-TREADMILL-WORKOUT-WORD-TO-TIMMY-GACQUIN.ASP (Caused
by NewConnectionError(': Failed to establish a new connection: [Errno
11004] getaddrinfo failed',))
I tried many solutions like the set here: Max retries exceeded with URL in requests, but nothing worked.
I was thinking about another solution, which is to pass a user agent in the request and change it randomly each time (using a large number of user agents):
import random

# Browser User-Agent strings to rotate between requests.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
]
# Use stdlib random.choice -- the original indexed the list through
# np.random.randint even though numpy was never imported in this snippet.
r = requests.get(short_url, headers={'User-Agent': random.choice(user_agent_list)})
if r.status_code == 200:
    print("Actual url:%s" % r.url)
My problem is that r.url always return the short url instead of the long one (the expanded one).
What am I missing?
You can prevent the error by adding allow_redirects=False to requests.get() method to prevent redirecting to page that doesn't exist (and thus raising the error). You have to examine the header sent by server yourself (replace XXXX by https, remove spaces):
import requests

# Placeholder URLs: replace XXXX with https and delete the spaces.
short_url = ["XXXX t.co /namDL4YHYu",
             'XXXX t.co /MjvmV',
             'XXXX t.co /JSjtxfaxRJ',
             'XXXX t.co /xxGSANSE8K',
             'XXXX t.co /ZRhf5gWNQg']

for link in short_url:
    # With redirects disabled, the redirect target (if any) is exposed in
    # the Location response header instead of being followed.
    resp = requests.get(link, allow_redirects=False)
    target = resp.headers.get('location')
    if target is not None:
        print(link, target)
    else:
        print(link, "Page doesn't exist!")
Prints:
XXXX t.co/namDL4YHYu http://gottimechillinaround.tumblr.com/post/133931725110/tip-672
XXXX t.co/MjvmV Page doesn't exist!
XXXX t.co/JSjtxfaxRJ http://www.youtube.com/watch?v=rE693eNyyss
XXXX t.co/xxGSANSE8K http://www.losefattips.pw/Tips/My-stretch-before-and-after-my-workout-is-just-as-important-to-me-as-my-workout.asp
XXXX .co/ZRhf5gWNQg http://www.youtube.com/watch?v=3OK1P9GzDPM

Login using python requests doesn't work for pythonanywhere.com

I am trying login to the site pythonanywhere.com
import requests

url = 'https://www.pythonanywhere.com/login'
s = requests.session()

# Form fields the login view expects.
values = {
    'auth-username': 'username',
    'auth-password': 'password',
}
# Present ourselves as a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
}
u = s.post(url, data=values, headers=headers)
But I am getting a <Response [403]> , Csrf verification failed. How do I login to that site?
You need to GET the page first, so you can obtain the csrftoken and session ID. And remember to set Referer=https://www.pythonanywhere.com/login/
import requests

url = 'https://www.pythonanywhere.com/login'
s = requests.session()
# The initial GET sets the csrftoken (and session) cookies needed below.
s.get(url)

values = {
    'auth-username': 'username',
    'auth-password': 'password',
    # Django CSRF protection: echo the cookie value back as a form field.
    "csrfmiddlewaretoken": s.cookies.get("csrftoken"),
    "login_view-current_step": "auth",
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    'Referer': 'https://www.pythonanywhere.com/login/',
}
u = s.post(url, data=values, headers=headers)
print(u.content)

ModuleNotFoundError: No module named 'urllib2'

So I have import urllib2 in Python 3.2, using komodo edit (not sure if that matters)
Using it in these spots
# NOTE(review): urllib2 exists only in Python 2; under Python 3 these calls
# live in urllib.request / urllib.error.  The '...' lines mark code elided
# from the original script, so these fragments do not run on their own.
opener = urllib2.build_opener()
...
request = urllib2.Request(src, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'})
srclen = float(urllib2.urlopen(request).info().get('Content-Length'))
...
request = urllib2.Request(url, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'})
srclen = float(urllib2.urlopen(request).info().get('Content-Length'))
...
request = urllib2.Request(url, headers=url_headers)
response = urllib2.urlopen(request)
...
request = urllib2.Request(RPTURL + rpt + '.xxx', headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'})
srclen = float(urllib2.urlopen(request).info().get('Content-Length'))
# Fragment: the try block this handler belongs to was elided above.
except urllib2.HTTPError:
I get the error ModuleNotFoundError: No module named 'urllib2'
How can I correct this?

Resources