Multi-thread in Python [duplicate]

This question already has answers here:
No schema supplied and other errors with using requests.get()
(6 answers)
Closed 6 years ago.
I'm following the book "Automate the Boring Stuff with Python" and I'm trying to create a program that downloads multiple comics from http://xkcd.com simultaneously, but I have run into some problems. I'm copying the program exactly as it appears in the book.
Here's my code:
# multidownloadXkcd.py - Downloads XKCD comics using multiple threads.
import requests, os, bs4, threading

os.chdir('c:\\users\\patty\\desktop')
os.makedirs('xkcd', exist_ok=True)  # store comics in ./xkcd

def downloadXkcd(startComic, endComic):
    for urlNumber in range(startComic, endComic):
        # Download the page.
        print('Downloading page http://xkcd.com/%s...' % (urlNumber))
        res = requests.get('http://xkcd.com/%s' % (urlNumber))
        res.raise_for_status()
        soup = bs4.BeautifulSoup(res.text, "html.parser")

        # Find the URL of the comic image.
        comicElem = soup.select('#comic img')
        if comicElem == []:
            print('Could not find comic image.')
        else:
            comicUrl = comicElem[0].get('src')
            # Download the image.
            print('Downloading image %s...' % (comicUrl))
            res = requests.get(comicUrl, "html.parser")
            res.raise_for_status()

            # Save the image to ./xkcd.
            imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
            for chunk in res.iter_content(100000):
                imageFile.write(chunk)
            imageFile.close()

downloadThreads = []  # a list of all the Thread objects
for i in range(0, 1400, 100):  # loops 14 times, creates 14 threads
    downloadThread = threading.Thread(target=downloadXkcd, args=(i, i + 99))
    downloadThreads.append(downloadThread)
    downloadThread.start()

# Wait for all threads to end.
for downloadThread in downloadThreads:
    downloadThread.join()
print('Done.')
I'm getting the following exception:
Exception in thread Thread-1:
Traceback (most recent call last):
File "C:\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
self.run()
File "C:\Python\Python35\lib\threading.py", line 862, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\PATTY\PycharmProjects\CH15_TASKS\practice.py", line 13, in downloadXkcd
res.raise_for_status()
File "C:\Python\Python35\lib\site-packages\requests\models.py", line 862, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 404 Client Error: Not Found for url: http://xkcd.com/0
Exception in thread Thread-2:
Traceback (most recent call last):
File "C:\Python\Python35\lib\threading.py", line 914, in _bootstrap_inner
self.run()
File "C:\Python\Python35\lib\threading.py", line 862, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\PATTY\PycharmProjects\CH15_TASKS\practice.py", line 25, in downloadXkcd
res = requests.get(comicUrl, "html.parser")
File "C:\Python\Python35\lib\site-packages\requests\api.py", line 70, in get
return request('get', url, params=params, **kwargs)
File "C:\Python\Python35\lib\site-packages\requests\api.py", line 56, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python\Python35\lib\site-packages\requests\sessions.py", line 461, in request
prep = self.prepare_request(req)
File "C:\Python\Python35\lib\site-packages\requests\sessions.py", line 394, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Python\Python35\lib\site-packages\requests\models.py", line 294, in prepare
self.prepare_url(url, params)
File "C:\Python\Python35\lib\site-packages\requests\models.py", line 354, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '//imgs.xkcd.com/comics/family_circus.jpg': No schema supplied. Perhaps you meant http:////imgs.xkcd.com/comics/family_circus.jpg?
It says the URL is invalid, but whenever I copy and paste that URL into the web browser it seems to work fine. Does anyone know how I would fix this? Thanks.

Yeah, as @spectras said, just because your browser fixes up the URL doesn't mean it is valid. The src attribute you scraped is a protocol-relative URL (//imgs.xkcd.com/...), so requests has no scheme to work with.
Try putting "http:" (or "https:") in front of it and see if it works.
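For example, a minimal sketch of that fix inside downloadXkcd() (assuming the rest of the script stays the same); it also drops the stray "html.parser" argument that was accidentally passed to requests.get():
comicUrl = comicElem[0].get('src')  # e.g. '//imgs.xkcd.com/comics/family_circus.jpg'
if comicUrl.startswith('//'):
    comicUrl = 'https:' + comicUrl  # make the protocol-relative URL absolute

# Download the image.
print('Downloading image %s...' % (comicUrl))
res = requests.get(comicUrl)  # no second positional argument here
res.raise_for_status()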

Related

Receiving EPIPE error when streaming from PSQL copy function

I am trying to write a streaming implementation that dumps a table from psql into a pre-signed URL on S3. Unfortunately, it seems to error out at a seemingly random point in the upload. I have tried many combinations of opening/closing the file descriptors at different times, and I cannot for the life of me figure out why this is occurring.
The strangest thing is that when I mock the requests library and analyze the sent data, it works as intended. The socket raises an EPIPE error at some point partway through the stream:
from psycopg2 import connect
import os  # needed for os.pipe / os.fdopen below
import threading
import requests
import requests_mock
import traceback
from base64 import b64decode
from boto3 import session

r_fd, w_fd = os.pipe()

connection = connect(host='host', database='db',
                     user='user', password='pw')
cursor = connection.cursor()

b3_session = session.Session(profile_name='profile', region_name='us-east-1')
url = b3_session.client('s3').generate_presigned_url(
    ClientMethod='put_object',
    Params={'Bucket': 'bucket', 'Key': 'test_streaming_upload.txt'},
    ExpiresIn=3600)

rd = os.fdopen(r_fd, 'rb')
wd = os.fdopen(w_fd, 'wb')

def stream_data():
    print('Starting stream')
    with os.fdopen(r_fd, 'rb') as rd:
        requests.put(url, data=rd, headers={'Content-type': 'application/octet-stream'})
    print('Ending stream')

to_thread = threading.Thread(target=stream_data)
to_thread.start()

print('Starting copy')
with os.fdopen(w_fd, 'wb') as wd:
    cursor.copy_expert('COPY table TO STDOUT WITH CSV HEADER', wd)
print('Ending copy')

to_thread.join()
The output is always the same:
Starting stream
Starting copy
Exception in thread Thread-1:
Traceback (most recent call last):
File "/venv/lib/python3.9/site-packages/urllib3/contrib/pyopenssl.py", line 342, in _send_until_done
return self.connection.send(data)
File "/venv/lib/python3.9/site-packages/OpenSSL/SSL.py", line 1718, in send
self._raise_ssl_error(self._ssl, result)
File "/venv/lib/python3.9/site-packages/OpenSSL/SSL.py", line 1624, in _raise_ssl_error
raise SysCallError(errno, errorcode.get(errno))
OpenSSL.SSL.SysCallError: (32, 'EPIPE')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/venv/lib/python3.9/site-packages/requests/adapters.py", line 473, in send
low_conn.send(b'\r\n')
File "/Users/me/.pyenv/versions/3.9.7/lib/python3.9/http/client.py", line 995, in send
self.sock.sendall(data)
File "/venv/lib/python3.9/site-packages/urllib3/contrib/pyopenssl.py", line 354, in sendall
sent = self._send_until_done(
File "/venv/lib/python3.9/site-packages/urllib3/contrib/pyopenssl.py", line 349, in _send_until_done
raise SocketError(str(e))
OSError: (32, 'EPIPE')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/me/.pyenv/versions/3.9.7/lib/python3.9/threading.py", line 973, in _bootstrap_inner
self.run()
File "/Users/me/.pyenv/versions/3.9.7/lib/python3.9/threading.py", line 910, in run
self._target(*self._args, **self._kwargs)
File "/Users/me/Library/Application Support/JetBrains/PyCharm2021.2/scratches/scratch_60.py", line 37, in stream_data
requests.put(url, data=rd, headers={'Content-type': 'application/octet-stream'})
File "/venv/lib/python3.9/site-packages/requests/api.py", line 131, in put
return request('put', url, data=data, **kwargs)
File "/venv/lib/python3.9/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/venv/lib/python3.9/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/venv/lib/python3.9/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/venv/lib/python3.9/site-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: (32, 'EPIPE')
Am I missing something obvious? Is this a memory error? I appreciate any insight I can get because this is killing me. I can verify that the socket is being written to anywhere from 1.5 to 2.5k times before this error occurs.
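One possibility worth checking (an assumption based on the traceback, not something the question confirms): when requests is handed a pipe whose length it cannot determine, it sends the body with Transfer-Encoding: chunked, and a presigned S3 put_object URL generally expects a body with a known Content-Length, so the server may be dropping the connection mid-upload, which shows up on this side as EPIPE. A minimal sketch of a workaround under that assumption is to spool the COPY output to a temporary file first, reusing cursor and url from the code above:
import tempfile
import requests

# Spool the COPY output to disk so the upload has a known Content-Length
# instead of being sent with chunked transfer encoding.
with tempfile.TemporaryFile() as buf:
    cursor.copy_expert('COPY table TO STDOUT WITH CSV HEADER', buf)
    buf.seek(0)
    resp = requests.put(url, data=buf,
                        headers={'Content-type': 'application/octet-stream'})
    resp.raise_for_status()
This gives up true streaming, so treat it as a diagnostic step or fallback rather than a drop-in replacement for the pipe-based design.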

problems downloading large files with requests?

I'm trying to download a video file using an API. The equivalent curl command works without problems, and the Python code below works without error for small videos:
with requests.get("http://username:password#url/Download/", data=data, stream=True) as r:
    r.raise_for_status()
    with open("deliverables/video_output34.mp4", "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            f.write(chunk)
It fails for large videos (it failed for a video of about 34 MB), while the equivalent curl command works for that one:
Traceback (most recent call last):
File "/home/nabil/.local/lib/python3.7/site-packages/requests/adapters.py", line 479, in send
r = low_conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/nabil/.local/lib/python3.7/site-packages/requests/adapters.py", line 482, in send
r = low_conn.getresponse()
File "/usr/local/lib/python3.7/http/client.py", line 1321, in getresponse
response.begin()
File "/usr/local/lib/python3.7/http/client.py", line 296, in begin
version, status, reason = self._read_status()
File "/usr/local/lib/python3.7/http/client.py", line 265, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/nabil/.local/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/home/nabil/.local/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/home/nabil/.local/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/home/nabil/.local/lib/python3.7/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/home/nabil/.local/lib/python3.7/site-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: Remote end closed connection without response
I've checked similar links and questions without success.
Thanks to SilentGhost on IRC #python, who suggested I should upgrade my requests package, which solved it (from 2.22.0 to 2.24.0).
Upgrading the package is done like this:
pip install requests --upgrade
Another option that may help someone looking at this question is to use pycurl; here is a good starting point: https://github.com/rajatkhanduja/PyCurl-Downloader
You can also pass --libcurl to your curl command to get a good indication of how to use pycurl.
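For reference, a minimal pycurl download sketch (the URL and output filename below are placeholders, not taken from the question):
import pycurl

url = "http://example.com/Download/"  # placeholder URL
with open("video_output.mp4", "wb") as f:
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, f)          # stream the response body straight to the file
    c.setopt(c.FOLLOWLOCATION, True)  # follow redirects, like curl -L
    c.perform()
    c.close()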

Trying to test whether some URL addresses are working with Python requests, but getting errors

I'm trying to learn how to test some internet addresses with Python requests, expecting outputs like 200 or 404. But I get errors that I couldn't figure out. I'm also open to any advice for my purpose.
import os, sys, requests
from multiprocessing import Pool

def url_check(url):
    resp = requests.get(url)
    print(resp.status_code)

with Pool(4) as p:
    print(p.map(url_check, [ "https://api.github.com​", "​http://bilgisayar.mu.edu.tr/​", "​https://www.python.org/​", "http://akrepnalan.com/ceng2034​", "https://github.com/caesarsalad/wow​" ]))
Output of the code with errors:
404
404
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/usr/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "ödev_deneme.py", line 6, in url_check
resp = requests.get(url)
File "/home/efe/.local/lib/python3.6/site-packages/requests/api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "/home/efe/.local/lib/python3.6/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/efe/.local/lib/python3.6/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/efe/.local/lib/python3.6/site-packages/requests/sessions.py", line 637, in send
adapter = self.get_adapter(url=request.url)
File "/home/efe/.local/lib/python3.6/site-packages/requests/sessions.py", line 728, in get_adapter
raise InvalidSchema("No connection adapters were found for {!r}".format(url))
requests.exceptions.InvalidSchema: No connection adapters were found for '\u200bhttps://www.python.org/\u200b'
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "ödev_deneme.py", line 10, in <module>
print(p.map(url_check, [ "https://api.github.com​", "​http://bilgisayar.mu.edu.tr/​", "​https://www.python.org/​", "http://akrepnalan.com/ceng2034​", "https://github.com/caesarsalad/wow​" ]))
File "/usr/lib/python3.6/multiprocessing/pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/lib/python3.6/multiprocessing/pool.py", line 644, in get
raise self._value
requests.exceptions.InvalidSchema: No connection adapters were found for '\u200bhttps://www.python.org/\u200b'
My expected output should be like this:
200
200
200
404
200
There is a 404 on the fourth line because the fourth URL address is not working. But in my output there are already 404s in the first two lines. There is a huge mistake in my code, I guess.
The problem is that some of the urls include invisible ZERO WIDTH SPACE characters ('\u200b').
You can replace them with an empty string:
def url_check(url):
    resp = requests.get(url.replace('\u200b', ''))
    print(resp.status_code)
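Put together, a self-contained sketch of the cleaned-up script might look like this (the URL list is retyped here without the invisible characters, so treat it as illustrative):
import requests
from multiprocessing import Pool

def url_check(url):
    # Strip any invisible ZERO WIDTH SPACE characters that came along with a copy-paste.
    resp = requests.get(url.replace('\u200b', ''))
    print(resp.status_code)

urls = [
    "https://api.github.com",
    "http://bilgisayar.mu.edu.tr/",
    "https://www.python.org/",
    "http://akrepnalan.com/ceng2034",
    "https://github.com/caesarsalad/wow",
]

if __name__ == '__main__':
    with Pool(4) as p:
        p.map(url_check, urls)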

Handling Article Exceptions in Newspaper

I have a bit of code that uses newspaper to go take a look at various media outlets and download articles from them. This has been working fine for a long time but has recently started acting up. I can see what the problem is but as I'm new to Python I'm not sure about the best way to address it. Basically (I think) I need to make a modification to keep the occasional malformed web address from crashing the script entirely and instead allow it to dispense with that web address and move on to the others.
The origins of the error is when I attempt to download an article using:
article.download()
Some articles (they change every day obviously) will throw the following error but the script continues to run:
Traceback (most recent call last):
File "C:\Anaconda3\lib\encodings\idna.py", line 167, in encode
raise UnicodeError("label too long")
UnicodeError: label too long
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Anaconda3\lib\site-packages\newspaper\mthreading.py", line 38, in run
func(*args, **kargs)
File "C:\Anaconda3\lib\site-packages\newspaper\source.py", line 350, in download_articles
html = network.get_html(url, config=self.config)
File "C:\Anaconda3\lib\site-packages\newspaper\network.py", line 39, in get_html return get_html_2XX_only(url, config, response)
File "C:\Anaconda3\lib\site-packages\newspaper\network.py", line 60, in get_html_2XX_only url=url, **get_request_kwargs(timeout, useragent))
File "C:\Anaconda3\lib\site-packages\requests\api.py", line 72, in get return request('get', url, params=params, **kwargs)
File "C:\Anaconda3\lib\site-packages\requests\api.py", line 58, in request return session.request(method=method, url=url, **kwargs)
File "C:\Anaconda3\lib\site-packages\requests\sessions.py", line 502, in request resp = self.send(prep, **send_kwargs)
File "C:\Anaconda3\lib\site-packages\requests\sessions.py", line 612, in send r = adapter.send(request, **kwargs)
File "C:\Anaconda3\lib\site-packages\requests\adapters.py", line 440, in send timeout=timeout
File "C:\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen chunked=chunked)
File "C:\Anaconda3\lib\site-packages\urllib3\connectionpool.py", line 356, in _make_request conn.request(method, url, **httplib_request_kw)
File "C:\Anaconda3\lib\http\client.py", line 1107, in request self._send_request(method, url, body, headers)
File "C:\Anaconda3\lib\http\client.py", line 1152, in _send_request self.endheaders(body)
File "C:\Anaconda3\lib\http\client.py", line 1103, in endheaders self._send_output(message_body)
File "C:\Anaconda3\lib\http\client.py", line 934, in _send_output self.send(msg)
File "C:\Anaconda3\lib\http\client.py", line 877, in send self.connect()
File "C:\Anaconda3\lib\site-packages\urllib3\connection.py", line 166, in connect conn = self._new_conn()
File "C:\Anaconda3\lib\site-packages\urllib3\connection.py", line 141, in _new_conn (self.host, self.port), self.timeout, **extra_kw)
File "C:\Anaconda3\lib\site-packages\urllib3\util\connection.py", line 60, in create_connection for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Anaconda3\lib\socket.py", line 733, in getaddrinfo for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
UnicodeError: encoding with 'idna' codec failed (UnicodeError: label too long)
The next bit is supposed to then parse and run natural language processing on each article and write certain elements to a dataframe so I then have:
for paper in papers:
    for article in paper.articles:
        article.parse()
        print(article.title)
        article.nlp()
        if article.publish_date is None:
            d = datetime.now().date()
        else:
            d = article.publish_date.date()
        stories.loc[i] = [paper.brand, d, datetime.now().date(), article.title, article.summary, article.keywords, article.url]
        i += 1
(This might be a little sloppy too but that's a problem for another day)
This runs fine until it gets to one of those URLs with the error and then tosses an article exception and the script crashes:
C:\Anaconda3\lib\site-packages\PIL\TiffImagePlugin.py:709: UserWarning: Corrupt EXIF data. Expecting to read 2 bytes but only got 0.
warnings.warn(str(msg))
ArticleException Traceback (most recent call last) <ipython-input-17-2106485c4bbb> in <module>()
4 for paper in papers:
5 for article in paper.articles:
----> 6 article.parse()
7 print(article.title)
8 article.nlp()
C:\Anaconda3\lib\site-packages\newspaper\article.py in parse(self)
183
184 def parse(self):
--> 185 self.throw_if_not_downloaded_verbose()
186
187 self.doc = self.config.get_parser().fromstring(self.html)
C:\Anaconda3\lib\site-packages\newspaper\article.py in throw_if_not_downloaded_verbose(self)
519 if self.download_state == ArticleDownloadState.NOT_STARTED:
520 print('You must `download()` an article first!')
--> 521 raise ArticleException()
522 elif self.download_state == ArticleDownloadState.FAILED_RESPONSE:
523 print('Article `download()` failed with %s on URL %s' %
ArticleException:
So what's the best way to keep this from terminating my script? Should I address it in the download stage where I'm getting the unicode error or at the parse stage by telling it to overlook those bad addresses? And how would I go about implementing that correction?
Really appreciate any advice.
I had the same issue and although in general using except: pass is not recommended, the following worked for me:
try:
    a.parse()
    file.write(a.title + '\n')
except:
    pass
What I've found is that Navid is correct for this exact problem.
However, .parse() is only one of the functions that can trip you up, so I wrap all the calls inside a try/except structure like this:
word_list = []
for words in google_news.articles:
    try:
        words.download()
        words.parse()
        words.nlp()
    except:
        pass
    word_list.append(words.keywords)
You can try catching the ArticleException. Don't forget to import the newspaper module.
import newspaper

try:
    article.download()
    article.parse()
except newspaper.article.ArticleException:
    # do something
    pass
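Applied to the loop from the question, a sketch could look like the following (it reuses papers, stories, and i from the question; skipping the failed article with continue is an assumption about the desired behaviour):
import newspaper
from datetime import datetime

for paper in papers:
    for article in paper.articles:
        try:
            article.parse()
            article.nlp()
        except newspaper.article.ArticleException:
            continue  # skip articles whose download failed
        d = article.publish_date.date() if article.publish_date else datetime.now().date()
        stories.loc[i] = [paper.brand, d, datetime.now().date(), article.title,
                          article.summary, article.keywords, article.url]
        i += 1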

BeautifulSoup timing out on instantiation?

I'm just doing some web scraping with BeautifulSoup and I'm running into a weird error. Code:
print "Running urllib2"
g = urllib2.urlopen(link + "about", timeout=5)
print "Finished urllib2"
about_soup = BeautifulSoup(g, 'lxml')
Here's the output:
Running urllib2
Finished urllib2
Error
Traceback (most recent call last):
File "/Users/pspieker/Documents/projects/ThePyStrikesBack/tests/TestSpringerOpenScraper.py", line 10, in test_strip_chars
for row in self.instance.get_entries():
File "/Users/pspieker/Documents/projects/ThePyStrikesBack/src/JournalScrapers.py", line 304, in get_entries
about_soup = BeautifulSoup(g, 'lxml')
File "/Users/pspieker/.virtualenvs/thepystrikesback/lib/python2.7/site-packages/bs4/__init__.py", line 175, in __init__
markup = markup.read()
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 355, in read
data = self._sock.recv(rbufsize)
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 588, in read
return self._read_chunked(amt)
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 648, in _read_chunked
value.append(self._safe_read(amt))
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 703, in _safe_read
chunk = self.fp.read(min(amt, MAXAMOUNT))
File "/usr/local/Cellar/python/2.7.11/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 384, in read
data = self._sock.recv(left)
timeout: timed out
I understand that the urllib2.urlopen call could be causing problems, but the exception occurs on the line instantiating BeautifulSoup. I did some googling but couldn't find anything about BeautifulSoup timeout issues.
Any ideas on what is happening?
It is the urllib2 part that is causing the timeout.
The reason you see it failing on the BeautifulSoup instantiation line is that g, the file-like object, is being read by BeautifulSoup internally. This is the part of the stack trace that proves it:
File "/Users/pspieker/.virtualenvs/thepystrikesback/lib/python2.7/site-packages/bs4/__init__.py", line 175, in __init__
markup = markup.read()
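If you want to handle the timeout yourself, one option (a sketch, not the only approach) is to do the read explicitly before handing the markup to BeautifulSoup, so the socket.timeout is raised where you expect it (link is the variable from the question):
import socket
import urllib2
from bs4 import BeautifulSoup

try:
    g = urllib2.urlopen(link + "about", timeout=5)
    html = g.read()  # the slow network read happens here, not inside BeautifulSoup
except socket.timeout:
    html = None      # decide how to handle the timeout (retry, skip, log, ...)

if html is not None:
    about_soup = BeautifulSoup(html, 'lxml')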
