Language Ver: Python 3.6.3
IDE Ver: PyCharm 2017.2.3
I was trying to parse a weather website to print the weather for a place. As I am learning Python, I previously used urllib.request.urlopen(url).read() and it worked. Now I am changing the code to use BeautifulSoup4 and the requests module. Below is my code:
# Minimal reproduction: download the AccuWeather forecast page and print the
# first <div class="info"> element found in it.
from bs4 import *
import requests
url = "https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844"
# No headers and no timeout are passed here, so the request goes out with the
# library's defaults.
data = requests.get(url)
soup = BeautifulSoup(data.text, "html.parser")
# find() yields the first matching tag, or None when nothing matches.
print(soup.find('div', {'class': 'info'}))
But each time I try to run the code, it gives me the following error:
Traceback (most recent call last):
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "", line 2, in raise_from
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 1009, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 871, in read
return self._sslobj.read(len, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 440, in send
timeout=timeout
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\retry.py", line 357, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\packages\six.py", line 685, in reraise
raise value.with_traceback(tb)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "", line 2, in raise_from
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 1009, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 871, in read
return self._sslobj.read(len, buffer)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
urllib3.exceptions.ProtocolError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:/Projects/Python/Practice/Practice1.py", line 5, in
data = requests.get(url)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Nrusingh\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 490, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
Process finished with exit code 1
What does this error mean, and how can I correct it? And why did the same request work with urllib but not with requests?
I ran your code as-is and got the same error, so I looked at how the browser sends the request. Some servers do not respond unless the request carries the headers that their backend processing expects. It turns out this server looks for the user-agent header, which is normally used to determine which client the request comes from. The amended code below works!
# Fetch the forecast page while identifying as a regular browser: this server
# does not answer requests that lack a browser-like User-Agent header.
from bs4 import BeautifulSoup  # explicit name instead of `import *`
import requests

url = "https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844"
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    ),
}
# requests applies no timeout by default, so bound the wait explicitly
# rather than letting the script hang on an unresponsive server.
data = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
data.raise_for_status()
soup = BeautifulSoup(data.text, "html.parser")
Now you can play with your soup!
You can in fact pass more headers, such as accept, dnt, pragma, accept-language, cache-control, etc. An explanation of these HTTP headers is a topic for another question, another time. Hope it helps :)
Try increasing the timeout parameter of your requests.get method :
requests.get(url, headers=headers, timeout=5)
However, your script may be blocked by the server in order to prevent scraping attempts. If that is the case, you can try imitating a web browser by setting appropriate headers:
{"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", "Referer": "http://example.com"}
your final code
import requests

# Browser-like headers so that the server treats the script as a normal client.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)",
    "Referer": "http://example.com",
}
url = "https://www.accuweather.com/en/in/dhenkanal/189844/weather-forecast/189844"
# Give up after 5 seconds instead of waiting on the server indefinitely.
data = requests.get(url, timeout=5, headers=headers)
Related
The Python requests code below, which fetches JSON from a website, works fine on my Windows laptop but refuses to run on my Ubuntu 20.04 VPS.
import json
import warnings

import requests

# Silence every warning for the rest of the script.
warnings.filterwarnings("ignore")

url = 'https://www1.nseindia.com/live_market/dynaContent/live_watch/stock_watch/niftyStockWatch.json'
# Browser-like request headers.
headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8,ml;q=0.7',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
}

r = requests.get(url, headers=headers)
# Decode the JSON body and dump it to stdout.
response = r.json()
print(response)
On Windows it executes in about a second, but on the Ubuntu VPS the code just hangs for about 10-15 minutes and then throws the following error:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.8/http/client.py", line 1332, in getresponse
response.begin()
File "/usr/lib/python3.8/http/client.py", line 303, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.8/http/client.py", line 264, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 719, in urlopen
retries = retries.increment(
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/lib/python3/dist-packages/six.py", line 702, in reraise
raise value.with_traceback(tb)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 665, in urlopen
httplib_response = self._make_request(
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.8/http/client.py", line 1332, in getresponse
response.begin()
File "/usr/lib/python3.8/http/client.py", line 303, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.8/http/client.py", line 264, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "TEST_requests.py", line 10, in <module>
r = requests.get(url, headers=headers, stream=True)
File "/usr/local/lib/python3.8/dist-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.8/dist-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
Any ideas what might be the issue?
I'm trying to extract the URLs from a website's listing page, but my BeautifulSoup script cannot do that. I get the following exception, even when I send the request with a header:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 384, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 1321, in getresponse
response.begin()
File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 296, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/http/client.py", line 257, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 589, in readinto
return self._sock.recv_into(b)
TimeoutError: [Errno 60] Operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "/usr/local/lib/python3.7/site-packages/urllib3/util/retry.py", line 368, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.7/site-packages/urllib3/packages/six.py", line 686, in reraise
raise value
File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 386, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/usr/local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 317, in _raise_timeout
raise ReadTimeoutError(self, url, "Read timed out. (read timeout=%s)" % timeout_value)
urllib3.exceptions.ReadTimeoutError: HTTPConnectionPool(host='www.sahibinden.com', port=80): Read timed out. (read timeout=None)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/soner/PycharmProjects/bitirme2/main.py", line 8, in <module>
r = requests.get(url)
File "/usr/local/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/adapters.py", line 529, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPConnectionPool(host='www.sahibinden.com', port=80): Read timed out. (read timeout=None)
Process finished with exit code 1
But when I try the url in the code with https://hackertarget.com/extract-links/, it brings URLs.
import requests
from bs4 import BeautifulSoup

url = 'http://www.sahibinden.com/satilik/istanbul-kartal?pagingOffset=50&pagingSize=50'
url2 = 'http://www.stackoverflow.com'

# Plain GET: no custom headers and no timeout are supplied.
r = requests.get(url)
html_content = r.text
soup = BeautifulSoup(html_content, 'lxml')

# Print the href of every listing link (<a class="classifiedTitle">).
listing_links = soup.find_all("a", {"class": "classifiedTitle"})
for anchor in listing_links:
    print(anchor.get('href'))

# Alternative attempt with a browser User-Agent and a 5-second timeout,
# disabled by wrapping it in a string literal.
'''
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
print(requests.get(url, headers=headers, timeout=5).text)
'''
As a note, it is possible that you will find yourself blocked by the website (sahibinden). I haven't researched using BeautifulSoup with a proxy list.
This is the code snippet I run and it worked as expected:
import requests
from bs4 import BeautifulSoup

# Identify as a desktop browser when requesting the listing page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
url = 'http://www.sahibinden.com/satilik/istanbul-kartal?pagingOffset=50&pagingSize=50'

# Bound the wait explicitly: with no timeout the call can block until the
# operating system gives up on the socket.
r = requests.get(url, headers=headers, timeout=10)
# Raise on 4xx/5xx (for example a block page) instead of silently printing
# nothing.
r.raise_for_status()

soup = BeautifulSoup(r.text, 'lxml')
# Each listing title is an <a class="classifiedTitle">; print its link target.
for a in soup.find_all('a', class_='classifiedTitle'):
    print(a.get('href'))
And here is the output of the code above:
/ilan/emlak-konut-satilik-directten%2Ccift-wc-li%2Cgenis-m2de%2Ciskanli%2Culasimi-kolay-sik-3-plus1-671049902/detay
/ilan/emlak-konut-satilik-nesrin-den-kartal-ugurmumcuda-satilik-3-plus1-yunus-emre-caddesinde-692133846/detay
/ilan/emlak-konut-satilik-akelden-karliktepe-de-genis-m2-li-krediye-uygun-daire-659458837/detay
/ilan/emlak-konut-satilik-ikea-ve-metro-yani-teknik-yapi-uprise-elite_mukemmel-firsat-3-plus1-692131163/detay
/ilan/emlak-konut-satilik-kartal-atalar-da-iskanli-5-plus1-dubleks-satilik-daire-692125302/detay
/ilan/emlak-konut-satilik-satilik-daire-kartal-atalar-da-2-plus1-lux-100-m2-671083034/detay
/ilan/emlak-konut-satilik-kartal-ugurmumcuda-3-plus1-genis-masrafsiz-satilik-daire-681180607/detay
/ilan/emlak-konut-satilik-soner-den-manzara-adalar-da-satilik-kacirilmayacak-kelepir-daire-653973723/detay
/ilan/emlak-konut-satilik-mertcan-dan-tarihi-ayazma-caddesinde-2-plus1-satilik-ters-dubleks-692122837/detay
/ilan/emlak-konut-satilik-cinar-emlak%2Ctan-hurriyet-mah-105-m2-toprak-tapulu-692117031/detay
/ilan/emlak-konut-satilik-kartal-cumhuriyet-te-arsa-hisseli-yuksek-giris-daire-692116930/detay
/ilan/emlak-konut-satilik-temiz-emlaktan-petroliste-2-plus1-satilik-sifir-deniz-manzarali-671086029/detay
/ilan/emlak-konut-satilik-cemal-yalcin-dan-ozel-mimarili-luks-satilik-dubleks-623158476/detay
/ilan/emlak-konut-satilik-la-marin-kartal-da-site-icerisinde-ozel-bahce-kati-sifir-daire-645480180/detay
/ilan/emlak-konut-satilik-sen-kardeslerden-merkezde-3-plus1%2Ccok-temiz-satilik-daire%2C350.000tl-692103788/detay
/ilan/emlak-konut-satilik-kartal-petrol-is-mah-de-3-plus1-deniz-manzarali-yatirimlik-daire-619762304/detay
/ilan/emlak-konut-satilik-remax-red-rukiye-korkmaz-dan-panorama-velpark-ta-esyali-1-plus1-616596826/detay
/ilan/emlak-konut-satilik-yakacik-demirli-twinstar-sitesi-ultra-luks-174-m2-3-plus1-daire-692104680/detay
/ilan/emlak-konut-satilik-kartal-soganlikta-yatirimlik-kiracili-firsat-2-plus1-daire-682793715/detay
/ilan/emlak-konut-satilik-istmarinada-devirli-taksitli-satilik-studyo-gulsen-yanmazdan-638548163/detay
/ilan/emlak-konut-satilik-sahibinden-satilik-kartal-merkezde-kaymakamligin-karsisinda-2-plus1-692054497/detay
/ilan/emlak-konut-satilik-petrolis______ara-kat-2-plus1-110-m2-lux-panjurlu_____carsiya-yakin-692100683/detay
/ilan/emlak-konut-satilik-ful-deniz-manzarali-3-plus1-ana-yola-cok-yakin-115m2-sifir-daire-585807696/detay
/ilan/emlak-konut-satilik-kartal-karlitepe-de-ters-dublek-2-plus2-satilik-daire-692085141/detay
/ilan/emlak-konut-satilik-kartal-dap-yapi-istmarina-full-deniz-manzarali-2-plus1-satilik-621795699/detay
/ilan/emlak-konut-satilik-aybars-dan-site-icinde-havuzlu-satilik-daire-671063936/detay
/ilan/emlak-konut-satilik-soganlik-yeni-mah-5-yillik-binada-adalar-manzarali-satilik-dair-679308838/detay
/ilan/emlak-konut-satilik-kartal-soganlik-orta-mah-e-5-yani-yeni-bina-kelepir-daire-573785719/detay
/ilan/emlak-konut-satilik-sahibinden-site-icerisinde-1-plus1-644746509/detay
/ilan/emlak-konut-satilik-3-plus1-luks-sitede-646420303/detay
/ilan/emlak-konut-satilik-mirac-dan-ayazma-koru-da-lux-yapili-3-plus1-135m2-masrafsiz-daire-535382195/detay
/ilan/emlak-konut-satilik-sahibinden-site-icerisinde-3-plus1-644729603/detay
/ilan/emlak-konut-satilik-cevizli-de-satilik-daire-2-plus1-lux-85-m2-671030197/detay
/ilan/emlak-konut-satilik-esentepe-de-bahceli-acik-otoparkli-125m2-ferah-kullansli-daire-670847710/detay
/ilan/emlak-konut-satilik-atalarda-ara-katta-sifir-binada-2-plus1-85-m2-otoparkli-510436215/detay
/ilan/emlak-konut-satilik-sahil-mesa-marmara-10.kat-122m2-deniz-manzarali-0-satilik-3-plus1-692085951/detay
/ilan/emlak-konut-satilik-kartal-da-sifir-ara-kat-3-plus1-satilik-daire-692090351/detay
/ilan/emlak-konut-satilik-pega-kartal-satis-ofisinden-2-plus1-kat-mulkiyetli-hemen-teslim-644626657/detay
/ilan/emlak-konut-satilik-adalilar-dan-kartal-hurriyet-mah-de-satilik-kelepir-3-plus1-dublex-682761629/detay
/ilan/emlak-konut-satilik-kartal-kordonboyunda-2-plus1-sifir-daire-647037679/detay
/ilan/emlak-konut-satilik-aklife-den_yakacik_carsi_mah_ultra_lux_katta_tek_sifir_2-plus1-654883140/detay
/ilan/emlak-konut-satilik-aklife-den_yakacik_da_mukanbel_yapi_kaliteli_3-plus1_arakat_sifir-657772595/detay
/ilan/emlak-konut-satilik-ciceksan-insaat-dan-3-plus1-daireler-hemen-tapu-hemen-teslim-682770303/detay
/ilan/emlak-konut-satilik-satilik-daire-ofis-2-1-85-mt-klepir-634724740/detay
/ilan/emlak-konut-satilik-ricar-dan%2C7-24-guvenlik%2Cyuzme-havuzu%2Ckapali-otopark%2Csifir%2Csitede-682744629/detay
/ilan/emlak-konut-satilik-ricar-dan%2Cana-cadde-uzeri%2Cgenis%2Cferah%2Csifir%2Clux%2Cara-kat-649504313/detay
/ilan/emlak-konut-satilik-mertcan-dan-e5-e-yurume-mesafesinde-iskanli-2-plus1-sifir-daire-692078490/detay
/ilan/emlak-konut-satilik-kartal-atalar-da-sahile-yurume-mesafesinde-iskanli-masrafsiz-3-plus1-454709956/detay
/ilan/emlak-konut-satilik-tugcan-pala-dan-mesa-kartall-da-satilik-2-kat-buyuk-tip-2-plus1-670434988/detay
/ilan/emlak-konut-satilik-satilik-sifir-daire-soganlik-yeni-mah-2-plus1-kat-mulkiyetli-682522237/detay
So I'm trying to use Python to send a POST request to a URL. I have basically copied everything from the network console in Firefox, yet I can't get it to work.
# Reproduction script: send a POST request that imitates one captured in the
# browser. The URL, cookie and payload are placeholders chosen by the author.
import requests
import json
url = 'https://www.example.com/example?handle'
cookie = {'UID':'b56f14d8-02b1-4ae8-ad5e-3485cf0abdbe'}
data ={'example':'example'}
# NOTE(review): Content-Length, Host and Cookie are hard-coded below, and the
# same cookie is also passed separately through `cookies=` in the post() call.
# NOTE(review): the Referer value has a single slash after "https:" - confirm
# against the request seen in the browser.
headers = {'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-US,en;q=0.5',
'Connection':'keep-alive',
'Content-Length':'78',
'Content-Type':'text/plain;charset=UTF-8',
'Cookie':'UID=b56f14d8-02b1-4ae8-ad5e-3485cf0abdbe',
'Host':'www.example.com',
'Referer':'https:/example.com/example',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}
# NOTE(review): the JSON-encoded payload is handed over as `params`, i.e. as
# URL query parameters, not as the request body (`data=`).
req = requests.post(url, cookies=cookie, params=json.dumps(data), headers=headers)
print(req)
I have replaced the actual data and url with example.
After running this, i get a big long error:
Traceback (most recent call last):
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32 \lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1321, in getresponse
response.begin()
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 296, in begin
version, status, reason = self._read_status()
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 257, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\ssl.py", line 1052, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\ssl.py", line 911, in read
return self._sslobj.read(len, buffer)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\util\retry.py", line 367, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\packages\six.py", line 685, in reraise
raise value.with_traceback(tb)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 1321, in getresponse
response.begin()
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 296, in begin
version, status, reason = self._read_status()
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\http\client.py", line 257, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\ssl.py", line 1052, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\ssl.py", line 911, in read
return self._sslobj.read(len, buffer)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/Kaiser/Documents/python ddoser.py", line 19, in <module>
req = requests.post(url, cookies=cookie, params=json.dumps(data).encode('UTF-8'), headers=headers)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 116, in post
return request('post', url, data=data, json=json, **kwargs)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Kaiser\AppData\Local\Programs\Python\Python37-32\lib\site-packages\requests\adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Does anyone know what I could be doing wrong? Sorry if it's a rookie mistake.
This error occurs when the web server rejects your connection. A web server can reject your connection for various reasons. Most likely, you did not properly mimic the HTTP request the web server was expecting.
You do not have to manually fill in all of the HTTP header fields, such as Content-Length; the library fills these in for you automatically. If you plan to make subsequent HTTP requests using the same cookies, consider creating a session object, which stores and handles the cookies for you:
session = requests.Session()
session.post(url, data=payload)
Also, realize the difference between the data parameter and the params parameter. Since you are sending a POST request, chances are that the web server is expecting the data to be passed via the data parameter (which places the data in the request body rather than the query string in the URL). Another reason a web server may reject your connection is because you sent a HTTP request rather than a HTTPS request, so always ensure everything is correct on your end.
Below is the code that I use to send POST data and read the response:
# Python 2 example: POST url-encoded form data with urllib2 and read the reply.
import urllib2
from urllib2 import URLError, HTTPError
import urllib
import ssl

# Form fields as (name, value) pairs; `funcion` and `codigo` are placeholders
# that must be defined before this snippet runs.
mydata = [('funcion', funcion), ('codigo', codigo)]
# A real dict of request headers. The original braces held four bare strings,
# which builds a set, and that set was never sent.
headers = {
    "Content-type": "application/x-www-form-urlencoded",
    "User-Agent": "Mozilla/5.0",
}

# SECURITY: this context disables hostname checking and certificate
# verification, so any certificate is accepted. Only keep it for hosts you
# trust (the original author needed it for their server).
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# debuglevel=1 echoes the plain-HTTP exchange to stdout. The HTTPS handler is
# what actually applies `ctx`; previously the context was created but unused.
opener = urllib2.build_opener(
    urllib2.HTTPHandler(debuglevel=1),
    urllib2.HTTPSHandler(context=ctx),
)
urllib2.install_opener(opener)

mydata = urllib.urlencode(mydata)
path = url  # the URL you want to POST to
# Supplying a body makes urllib2 issue a POST; headers travel with the request.
req = urllib2.Request(path, mydata, headers)
page = urllib2.urlopen(req).read()
My code is supposed to fetch data from a specific JSON-like URL (the output the web page offers is not the JSON that is required). When I fetch it over connection A, it returns the following error:
Traceback (most recent call last):
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
self.run()
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 20, in run
main_func(self.counter)
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 166, in main_func
total=url_to_dict(url)
File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 79, in url_to_dict
data = urllib.request.urlopen(url).read().decode('utf-8')
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 472, in open
response = meth(req, response)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 510, in error
return self._call_chain(*args)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Interestingly, when I fetch the data over connection B it works fine; however, I get the following error after 10000-20000 iterations:
Exception in thread Thread-9:
Traceback (most recent call last):
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\urllib\request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1106, in request
self._send_request(method, url, body, headers)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1151, in _send_request
self.endheaders(body)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 1102, in endheaders
self._send_output(message_body)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 934, in _send_output
self.send(msg)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 877, in send
self.connect()
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\http\client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 711, in create_connection
raise err
File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\socket.py", line 702, in create_connection
sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
I searched the internet for several hours about the error with connection B. It mainly occurs because of a connection problem or a proxy. I tried this solution with several different proxies, but it did not work either; it gave the same error after some thousand iterations:
proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
The problematic part is following :
class myThread(threading.Thread):
    """Thread that calls ``main_func`` with the ``counter`` it was created with.

    Args:
        threadID: Caller-chosen identifier; stored on the instance unchanged.
        name: Value assigned to the thread's ``name`` attribute.
        counter: Argument passed to ``main_func`` when the thread runs.
    """

    def __init__(self, threadID, name, counter):
        # super() keeps the base-class initialisation correct even if the
        # inheritance chain changes later.
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        """Run ``main_func(self.counter)``; invoked by ``start()``."""
        main_func(self.counter)
def url_to_dict(url):
    """Download ``url`` and decode the JSON object embedded in its body.

    The body is decoded as UTF-8 and searched for the first ``{...}`` span
    (from a ``{`` to the last ``}`` on the same line); that span is parsed
    as JSON. Any surrounding text, such as a callback wrapper, is ignored.

    Args:
        url: Address to fetch.

    Returns:
        A ``(json_data, total_page)`` tuple, where ``total_page`` is
        ``json_data['data']['totalPage']``.

    Raises:
        ValueError: If the body contains no ``{...}`` span, or the span is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If the decoded object lacks ``data`` or ``totalPage``.

    Errors raised by ``urlopen`` itself (connection failures, HTTP errors)
    propagate to the caller unchanged.
    """
    # Hand the Request object, not the bare URL, to urlopen; otherwise the
    # User-Agent header is never sent and the default urllib agent is used.
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        data = response.read().decode('utf-8')

    match = re.search(r'\{.*\}', data)
    if match is None:
        raise ValueError('no JSON object found in response from ' + url)

    json_data = json.loads(match.group(0))
    total_page = json_data['data']['totalPage']
    return json_data, total_page
def main_func(counter):
    """Walk the pages of every URL in ``url_list`` and pass them on for storage.

    For each URL (paired with the entry at the same position in
    ``company_list``) the page count is read via ``url_to_dict``. Then, for
    ``y`` in ``range(int(total_page / 10))``, the page number in the URL is
    replaced with ``counter + y * 10``, the page is fetched, run through
    ``get_data`` and handed to ``add_to_mongo`` together with the company
    name.

    Args:
        counter: Page offset for this caller; added to every multiple of 10.

    Raises:
        ValueError: If a URL that needs paging lacks the ``config[page]=``
            or ``&config[reply`` marker used to locate the page number.
    """
    page_marker = 'config[page]='
    reply_marker = '&config[reply'

    # install_opener replaces the default opener used by urlopen(), so the
    # requests made through url_to_dict below go via this proxy for "http" URLs.
    proxy_support = urllib.request.ProxyHandler({"http": "http://208.83.106.105:9999"})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    for url, company_name in zip(url_list, company_list):
        total_page = url_to_dict(url)[1]
        for y in range(int(total_page / 10)):
            index = url.find(page_marker)
            index2 = url.find(reply_marker)
            if index == -1 or index2 == -1:
                # str.find returns -1 on a miss; slicing with it would
                # silently build a malformed URL and request the wrong page.
                raise ValueError('cannot locate page number in url: ' + url)
            k = y * 10
            url = url[:index + len(page_marker)] + str(counter + k) + url[index2:]
            print(url)
            data = url_to_dict(url)
            parsed_data = get_data(data)
            add_to_mongo(parsed_data, company_name)
What can I do to fix this problem? Also, what is the cause of getting the Error 404 (Not Found)?
Thanks in advance
This isn't an answer (I still can't comment), but did you try the 'requests' library? I believe it's more powerful and more modern, so ..
import json
import string
import socket
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request made by this script.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}

# One session for the whole exchange, so cookies set by the login page are
# sent back automatically on the following requests.
with requests.Session() as s:
    # Load the login page first so the session receives its cookies.
    t = s.get("http://minewind.com/forums/ucp.php?mode=login", headers=headers)
    # Session id echoed back in the form data; fall back to an empty value
    # instead of raising KeyError when the cookie is absent.
    sid = t.cookies.get('phpbb3_qpac2_sid', '')

    # Form fields posted to the login page.
    payload = {"login": "Login",
               "password": "*********",
               # "redirect" used to be listed twice; a dict keeps only the
               # last value, so this is the value that was actually sent.
               "redirect": "index.php",
               "sid": sid,
               "username": "myusername",
               }

    # Send our own request headers. t.headers holds the *response* headers of
    # the login page and must not be replayed as request headers.
    s1 = s.post("http://minewind.com/forums/ucp.php?mode=login", data=payload, headers=headers)
    print(t.headers)

    # Fetch the index page with the same session to see whether the login took.
    s2 = s.get("http://minewind.com/forums/index.php", headers=headers)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    perty = BeautifulSoup(s2.content, "html.parser")
    for links in perty.find_all('a'):
        print(links.get('href'))
I finally configured the POST data properly, to my knowledge, but now I'm getting some weird connection errors. Any ideas? ERRORS:
Traceback (most recent call last):
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 331, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "C:\Python33\lib\http\client.py", line 1143, in getresponse
response.begin()
File "C:\Python33\lib\http\client.py", line 354, in begin
version, status, reason = self._read_status()
File "C:\Python33\lib\http\client.py", line 316, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python33\lib\socket.py", line 297, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python33\lib\site-packages\requests\adapters.py", line 362, in send
timeout=timeout
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 559, in urlopen
_pool=self, _stacktrace=stacktrace)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\util\retry.py", line 245, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\packages\six.py", line 309, in reraise
raise value.with_traceback(tb)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "C:\Python33\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "C:\Python33\lib\http\client.py", line 1143, in getresponse
response.begin()
File "C:\Python33\lib\http\client.py", line 354, in begin
version, status, reason = self._read_status()
File "C:\Python33\lib\http\client.py", line 316, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python33\lib\socket.py", line 297, in readinto
return self._sock.recv_into(b)
requests.packages.urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, 'An existing con
nection was forcibly closed by the remote host', None, 10054))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Anthony\site.py", line 28, in <module>
s2 = s.get("http://minewind.com/forums/index.php", headers=t.headers)
File "C:\Python33\lib\site-packages\requests\sessions.py", line 469, in get
return self.request('GET', url, **kwargs)
File "C:\Python33\lib\site-packages\requests\sessions.py", line 457, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python33\lib\site-packages\requests\sessions.py", line 569, in send
r = adapter.send(request, **kwargs)
File "C:\Python33\lib\site-packages\requests\adapters.py", line 407, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was for
cibly closed by the remote host', None, 10054))
Any thoughts on why I'm getting these 'connection aborted' errors?
You can just use the User-Agent headers for this instance; I was complicating it by grabbing the login page headers, which wasn't necessary. Also, you don't need to know the sid cookie beforehand like I thought you did. You can just include it with the POST data as empty. Just make sure you are inspecting which form data is being passed with Firebug or similar utilities stated above.
import requests
from bs4 import BeautifulSoup
import sys
headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36"}
with requests.Session() as s:
payload = {"login": "Login",
"password": "mypassword",
"redirect": "./ucp.php?mode=login",
"redirect": "index.php",
"sid": "",
"username": "myusername"}
url = "http://minewind.com/forums/index.php"
s1 = s.post("http://minewind.com/forums/ucp.php?mode=login", data=payload, headers=headers)
s2 = s.get(url, headers=headers)