Elasticsearch indexing stops at a certain index - Python

When I try to index my products with the bulk_indexing() process, it stops at a certain index. I changed the heap size in jvm.options, but it doesn't help.
This is the output:
Product 489 indexed
Product 490 indexed
Product 491 indexed
Product 492 indexed
Product 493 indexed
Product 494 indexed
Product 495 indexed
Product 496 indexed
Product 497 indexed
Product 498 indexed
Product 2280 indexed
POST http://localhost:9200/_bulk [status:N/A request:0.001s]
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/code/products/documents.py", line 93, in bulk_indexing
bulk(client=es, actions=(p.indexing() for p in models.Product.objects.all().iterator()))
File "/usr/local/lib/python3.6/site-packages/elasticsearch/helpers/__init__.py", line 257, in bulk
for ok, item in streaming_bulk(client, actions, **kwargs):
File "/usr/local/lib/python3.6/site-packages/elasticsearch/helpers/__init__.py", line 192, in streaming_bulk
raise_on_error, **kwargs)
File "/usr/local/lib/python3.6/site-packages/elasticsearch/helpers/__init__.py", line 99, in _process_bulk_chunk
raise e
File "/usr/local/lib/python3.6/site-packages/elasticsearch/helpers/__init__.py", line 95, in _process_bulk_chunk
resp = client.bulk('\n'.join(bulk_actions) + '\n', **kwargs)
File "/usr/local/lib/python3.6/site-packages/elasticsearch/client/utils.py", line 76, in _wrapped
return func(*args, params=params, **kwargs)
File "/usr/local/lib/python3.6/site-packages/elasticsearch/client/__init__.py", line 1150, in bulk
headers={'content-type': 'application/x-ndjson'})
File "/usr/local/lib/python3.6/site-packages/elasticsearch/transport.py", line 314, in perform_request
status, headers_response, data = connection.perform_request(method, url, params, body, headers=headers, ignore=ignore, timeout=timeout)
File "/usr/local/lib/python3.6/site-packages/elasticsearch/connection/http_urllib3.py", line 175, in perform_request
raise ConnectionError('N/A', str(e), e)
elasticsearch.exceptions.ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7ffb0733bb38>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7ffb0733bb38>: Failed to establish a new connection: [Errno 111] Connection refused)

The problem was the chunk size. It defaults to 500. I increased it and that solved my problem.
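For reference, here is a minimal sketch of how the chunk size can be passed to the bulk helper. The es client and the Product model are assumed to be the same ones used in documents.py, and 2000 is just an example value, not a recommendation:

from elasticsearch.helpers import bulk

def bulk_indexing():
    # chunk_size controls how many actions go into each _bulk request;
    # the helper's default is 500.
    bulk(
        client=es,
        actions=(p.indexing() for p in models.Product.objects.all().iterator()),
        chunk_size=2000,
    )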

Related

Python Web Scraper Intermittent "Connection Aborted" Error

Here is a class I wrote to get the Billboard Hot 100 songs by date.
The class uses requests to get the website's HTML text and then BeautifulSoup to parse it.
The parsing works well; the problem is the intermittent connection errors.
import json
import time
from bs4 import BeautifulSoup
import requests
import datetime as DT


class BillBoardScraper():
    def __init__(self) -> None:
        self.top_100 = None
        self.scraped_chart = None

    def _scrape_chart(self, date):
        url = 'https://www.billboard.com/charts/hot-100'
        headers = {
            "User-Agent": "Mozilla/5.0"
        }
        r = requests.get(f'{url}/{date}', headers=headers)
        bill_board_100_soup = BeautifulSoup(r.text, 'html.parser')
        r = None
        bill_board_100_results_soup = bill_board_100_soup.find_all(
            "div", "o-chart-results-list-row-container")
        return bill_board_100_results_soup

    def _get_song_and_artist(self, idx):
        for result_item in self.scraped_chart[idx].find_all('li'):
            segment_struct = [tag.name for tag in result_item if tag.name]
            if segment_struct == ['h3', 'span']:
                song_and_artist = []
                for tag in result_item:
                    if tag.string.strip():
                        song_and_artist.append(tag.string.strip())
                return song_and_artist

    def run_parser_and_archive_data(self, date):
        self.top_100 = {}
        self.scraped_chart = self._scrape_chart(date)
        for i in range(0, 100):
            song, artist = self._get_song_and_artist(i)
            self.top_100[i + 1] = {"track": song, "artist": artist, "date": date}
        json_string = json.dumps(self.top_100)
        with open(f'data_billboard/billboard_hot100_{date}.json', 'w') as outfile:
            json.dump(json_string, outfile)


date = DT.date(2010, 3, 19)
n_weeks = 520
c_week = 1
while c_week <= n_weeks:
    print(str(date))
    top100 = BillBoardScraper()
    top100.run_parser_and_archive_data(str(date))
    date = date - DT.timedelta(days=7)
    c_week += 1  # advance the week counter so the loop ends after n_weeks
    time.sleep(10)
Sporadically I receive the following error. Why does this happen, and what can I do to mitigate it? Any feedback is appreciated.
Traceback (most recent call last):
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 275, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\util\retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\packages\six.py", line 734, in reraise
raise value.with_traceback(tb)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1344, in getresponse
response.begin()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 306, in begin
version, status, reason = self._read_status()
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 275, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "d:\airflow\plugins\api_billboard100.py", line 55, in <module>
top100.run_parser_and_archive_data(str(date))
File "d:\airflow\plugins\api_billboard100.py", line 39, in run_parser_and_archive_data
self.scraped_chart = self._scrape_chart(date)
File "d:\airflow\plugins\api_billboard100.py", line 20, in _scrape_chart
f'{url}/{date}', headers=headers)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "C:\Users\{USER_NAME}\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
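The RemoteDisconnected error means the server (or a proxy in between) closed the connection before sending a response, which often happens when a site throttles rapid, repeated requests. One common mitigation, shown here only as a sketch with example values, is to reuse a single requests Session with an urllib3 Retry policy so transient failures are retried with backoff:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Retry transient failures (dropped connections, 429/5xx responses)
    # with exponential backoff before giving up.
    retry = Retry(total=5, backoff_factor=2,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

Inside _scrape_chart you would then call something like session.get(f'{url}/{date}', headers=headers, timeout=30) instead of requests.get(...); increasing the time.sleep between weeks can also help.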

urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object>, 'Connection to api.twitter.com timed out. (connect timeout=60)')

I am looking to retrieve some tweets using some search terms. After collecting around 25,000 tweets, it started producing a timeout error. FYI, I am using an Academic Research account and Twitter API v1.
I want to understand the cause of the error so that I can retrieve more tweets.
Following is my scraping script:
import datetime
import pandas as pd
import tweepy

def scrape(words, orig_text):
    # search for tweets between 2015 and 2018
    start = datetime.datetime(2015, 1, 1)
    end = datetime.datetime(2018, 12, 31)
    db = pd.DataFrame(columns=['text', 'hashtags', 'search tag', 'id'])
    tweets = tweepy.Cursor(api.search_tweets, q=words, lang="en",
                           tweet_mode='extended').items(25000)
    list_tweets = [tweet for tweet in tweets]
    for i, tweet in enumerate(list_tweets):
        tweet_id = tweet.id
        hashtags = tweet.entities['hashtags']
Following is how I make the connection:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
Following is the traceback for the error:
Rate limit reached. Sleeping for: 1013
Rate limit reached. Sleeping for: 1014
Rate limit reached. Sleeping for: 1014
Traceback (most recent call last):
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/connection.py", line 175, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/util/connection.py", line 95, in create_connection
raise err
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/util/connection.py", line 85, in create_connection
sock.connect(sa)
socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/connectionpool.py", line 710, in urlopen
chunked=chunked,
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/connectionpool.py", line 386, in _make_request
self._validate_conn(conn)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/connectionpool.py", line 1040, in _validate_conn
conn.connect()
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/connection.py", line 358, in connect
conn = self._new_conn()
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/urllib3/connection.py", line 182, in _new_conn
% (self.host, self.timeout),
urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f051a69efd0>, 'Connection to api.twitter.com timed out. (connect timeout=60)')
Traceback (most recent call last):
File "twitter_api.py", line 67, in <module>
main(search_words)
File "twitter_api.py", line 52, in main
scrape(search_words[i], orig_text)
File "twitter_api.py", line 20, in scrape
list_tweets = [tweet for tweet in tweets]
File "twitter_api.py", line 20, in <listcomp>
list_tweets = [tweet for tweet in tweets]
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/cursor.py", line 86, in __next__
return self.next()
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/cursor.py", line 286, in next
self.current_page = next(self.page_iterator)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/cursor.py", line 86, in __next__
return self.next()
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/cursor.py", line 167, in next
data = self.method(max_id=self.max_id, parser=RawParser(), *self.args, **self.kwargs)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/api.py", line 33, in wrapper
return method(*args, **kwargs)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/api.py", line 46, in wrapper
return method(*args, **kwargs)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/api.py", line 1272, in search_tweets
), q=q, **kwargs
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/api.py", line 222, in request
raise TweepyException(f'Failed to send request: {e}').with_traceback(sys.exc_info()[2])
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/tweepy/api.py", line 219, in request
timeout=self.timeout, auth=auth, proxies=self.proxy
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/requests/sessions.py", line 529, in request
resp = self.send(prep, **send_kwargs)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/requests/sessions.py", line 645, in send
r = adapter.send(request, **kwargs)
File "/home/kgarg8/anaconda3/envs/uda_pytorch/lib/python3.6/site-packages/requests/adapters.py", line 507, in send
raise ConnectTimeout(e, request=request)
tweepy.errors.TweepyException: Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /1.1/search/tweets.json?q=%22faith%22+OR+%23faith&max_id=1508152899637063689&lang=en&tweet_mode=extended (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f051a69efd0>, 'Connection to api.twitter.com timed out. (connect timeout=60)'))
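A connect timeout right after several rate-limit sleeps usually means the client could not re-establish the connection to api.twitter.com in time, either because of a flaky network or because Twitter is throttling the account. Two things worth trying, sketched below with example values (consumer_key, access_key, etc. and words are assumed to be defined as in the question): let tweepy retry failed requests itself, and catch the exception around the cursor so one failure does not lose everything collected so far.

import time
import tweepy

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
# retry_count/retry_delay make tweepy retry failed HTTP requests;
# timeout bounds how long a single request may hang.
api = tweepy.API(auth, wait_on_rate_limit=True,
                 retry_count=3, retry_delay=60, timeout=120)

cursor = tweepy.Cursor(api.search_tweets, q=words, lang="en",
                       tweet_mode='extended').items(25000)
list_tweets = []
while True:
    try:
        list_tweets.append(next(cursor))
    except StopIteration:
        break
    except tweepy.errors.TweepyException:
        # transient failure: wait and retry the same page (a real run
        # should cap the number of retries)
        time.sleep(60)

Saving the collected tweets to disk periodically inside the loop also limits how much is lost if the script does eventually die.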

CKAN Datapusher HTTP 500 error due to "Error - <class 'sqlalchemy.exc.TimeoutError'>: QueuePool limit of size 5 overflow 10 reached, connection timed out"

I'm using CKAN v2.7.2 (Docker) and I'm getting the error "Error - <class 'sqlalchemy.exc.TimeoutError'>: QueuePool limit of size 5 overflow 10 reached, connection timed out, timeout 30", which leads to HTTP 500 responses and requests that cannot be completed.
It seems that after some time CKAN has no connections left to serve a new request, which is why this error occurs. Is my understanding correct? If so, why is it consuming all the connections?
Are these values (pool_size and max_overflow) load dependent (on resource data or the size of resource files)?
Are a pool_size of 5 and a max_overflow of 10 not sufficient?
Should we increase only max_overflow, only pool_size, or both to avoid this HTTP 500 error (sqlalchemy.exc.TimeoutError)?
What are appropriate values, and how should we choose them?
datapusher.error.log
Fetching from: https://url/ckan/dataset/8a723839-5ea4-938b-bfa9-7637743dfe12/resource/0797383e-bac0-4114-943b-87c74d2677a9/download/Filelist1.csv
Error notifying listener
Traceback (most recent call last):
File "/usr/lib/ckan/datapusher/lib/python2.7/site-packages/apscheduler/scheduler.py", line 239, in _notify_listeners
cb(event)
File "/usr/lib/ckan/datapusher/lib/python2.7/site-packages/ckanserviceprovider/web.py", line 184, in job_listener
db.mark_job_as_errored(job_id, error_object)
File "/usr/lib/ckan/datapusher/lib/python2.7/site-packages/ckanserviceprovider/db.py", line 413, in mark_job_as_errored
_update_job(job_id, update_dict)
File "/usr/lib/ckan/datapusher/lib/python2.7/site-packages/ckanserviceprovider/db.py", line 348, in _update_job
job_dict["error"] = json.dumps(job_dict["error"])
File "/usr/lib/python2.7/json/__init__.py", line 243, in dumps
return _default_encoder.encode(obj)
File "/usr/lib/python2.7/json/encoder.py", line 207, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/lib/python2.7/json/encoder.py", line 270, in iterencode
return _iterencode(o, 0)
File "/usr/lib/python2.7/json/encoder.py", line 184, in default
raise TypeError(repr(o) + " is not JSON serializable")
TypeError: <Response [500]> is not JSON serializable
Job "push_to_datastore (trigger: RunTriggerNow, run = True, next run at: None)" raised an exception
Traceback (most recent call last):
File "/usr/lib/ckan/datapusher/lib/python2.7/site-packages/apscheduler/scheduler.py", line 512, in _run_job
retval = job.func(*job.args, **job.kwargs)
File "/usr/lib/ckan/datapusher/src/datapusher/datapusher/jobs.py", line 417, in push_to_datastore
existing = datastore_resource_exists(resource_id, api_key, ckan_url)
File "/usr/lib/ckan/datapusher/src/datapusher/datapusher/jobs.py", line 221, in datastore_resource_exists
response.status_code, search_url, response,
HTTPError: Error getting datastore resource. status=500 url=https://url/ckan/api/3/action/datastore_search response=<Response [500]>
ckan_http.error.log
Error - <class 'sqlalchemy.exc.TimeoutError'>: QueuePool limit of size 5 overflow 10 reached, connection timed out, timeout 30
URL: https://url/dataset/dataset1/resource/0797383e-bac0-4114-943b-87c74d2677a9
File '/usr/lib/ckan/default/lib/python2.7/site-packages/weberror/errormiddleware.py', line 171 in __call__
app_iter = self.application(environ, sr_checker)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/dec.py', line 147 in __call__
resp = self.call_func(req, *args, **self.kwargs)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/dec.py', line 208 in call_func
return self.func(req, *args, **kwargs)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/fanstatic/publisher.py', line 234 in __call__
return request.get_response(self.app)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/request.py', line 1053 in get_response
application, catch_exc_info=False)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/request.py', line 1022 in call_application
app_iter = application(self.environ, start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/dec.py', line 147 in __call__
resp = self.call_func(req, *args, **self.kwargs)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/dec.py', line 208 in call_func
return self.func(req, *args, **kwargs)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/fanstatic/injector.py', line 54 in __call__
response = request.get_response(self.app)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/request.py', line 1053 in get_response
application, catch_exc_info=False)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/webob/request.py', line 1022 in call_application
app_iter = application(self.environ, start_response)
File '/usr/lib/ckan/default/src/ckan/ckan/config/middleware/pylons_app.py', line 250 in inner
result = application(environ, start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/beaker/middleware.py', line 73 in __call__
return self.app(environ, start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/beaker/middleware.py', line 156 in __call__
return self.wrap_app(environ, session_start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/routes/middleware.py', line 131 in __call__
response = self.app(environ, start_response)
File '/usr/lib/ckan/default/src/ckan/ckan/config/middleware/common_middleware.py', line 80 in __call__
return self.app(environ, start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/wsgiapp.py', line 125 in __call__
response = self.dispatch(controller, environ, start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/wsgiapp.py', line 324 in dispatch
return controller(environ, start_response)
File '/usr/lib/ckan/default/src/ckan/ckan/lib/base.py', line 212 in __call__
res = WSGIController.__call__(self, environ, start_response)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/controllers/core.py', line 221 in __call__
response = self._dispatch_call()
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/controllers/core.py', line 172 in _dispatch_call
response = self._inspect_call(func)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/controllers/core.py', line 107 in _inspect_call
result = self._perform_call(func, args)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/controllers/core.py', line 60 in _perform_call
return func(**args)
File '/usr/lib/ckan/default/src/ckan/ckan/controllers/package.py', line 1123 in resource_read
return render(template, extra_vars=vars)
File '/usr/lib/ckan/default/src/ckan/ckan/lib/base.py', line 177 in render
return cached_template(template_name, render_template)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/pylons/templating.py', line 249 in cached_template
return render_func()
File '/usr/lib/ckan/default/src/ckan/ckan/lib/base.py', line 131 in render_template
return render_jinja2(template_name, globs)
File '/usr/lib/ckan/default/src/ckan/ckan/lib/base.py', line 88 in render_jinja2
return template.render(**extra_vars)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/jinja2/environment.py', line 989 in render
return self.environment.handle_exception(exc_info, True)
File '/usr/lib/ckan/default/lib/python2.7/site-packages/jinja2/environment.py', line 754 in handle_exception
reraise(exc_type, exc_value, tb)
File '/usr/lib/ckan/default/src/ckanext-fiware_theme_basic/ckanext/fiware_theme_basic/templates/package/resource_read.html', line 1 in top-level template code
{% ckan_extends %}
TimeoutError: QueuePool limit of size 5 overflow 10 reached, connection timed out, timeout 30
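The pool is exhausted when more requests hold database connections at the same time than pool_size + max_overflow allows, or when connections leak and are never returned, so the values are load dependent in the sense of concurrent requests rather than file sizes. If the load is legitimate, the usual remedy is to raise both values in CKAN's configuration; as far as I know CKAN passes sqlalchemy.* options through to SQLAlchemy's engine setup, so something like the following in production.ini should work (the numbers are examples, not tuned recommendations):

# keep your existing sqlalchemy.url line
sqlalchemy.pool_size = 10
sqlalchemy.max_overflow = 20
sqlalchemy.pool_timeout = 60

As a rough rule, pool_size + max_overflow per CKAN process should cover the number of concurrent requests each worker handles. If raising the limits only delays the error, a connection leak is the more likely cause.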

Counting words on multiple web pages at the same domain

I am writing a simple Python 3 web crawler intended to count the words on various sub-pages of a domain. The idea is to get all the sub-pages on the domain, then iterate through them and count the words in each.
My problem is that I'm getting various errors, such as urllib3.exceptions.NewConnectionError, and that the word counts are inaccurate.
Once I've perfected the code, I'll make it recursive so that it counts the words in sub-pages of sub-pages as well.
I would be grateful for any suggestions to improve my code.
import requests
from collections import Counter
from string import punctuation
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://www.paulelliottbooks.com/")
bsObj = BeautifulSoup(html.read(), features="html.parser")

urls = []
for link in bsObj.find_all('a'):
    if link.get('href') not in urls:
        urls.append(link.get('href'))
    else:
        pass
print(urls)

words = 0
for url in urls:
    specific_url = "https://www.paulelliottbooks.com" + url
    r = requests.get(specific_url)
    soup = BeautifulSoup(r.content, features="html.parser")
    text_div = (''.join(s.findAll(text=True)) for s in soup.findAll('div'))
    c_div = Counter((x.rstrip(punctuation).lower() for y in text_div for x in y.split()))
    word_count = sum(c_div.values())
    print(specific_url + " " + str(word_count))
    words += word_count
print(words)
Output:
https://www.paulelliottbooks.com/zozergames.html 12152
https://www.paulelliottbooks.com/43-ad.html 9306
https://www.paulelliottbooks.com/warband.html 6142
https://www.paulelliottbooks.com/camp-cretaceous.html 2886
https://www.paulelliottbooks.com/free-rpgs.html 5217
https://www.paulelliottbooks.com/grunt.html 7927
https://www.paulelliottbooks.com/hostile.html 7232
https://www.paulelliottbooks.com/alien-breeds.html 4946
https://www.paulelliottbooks.com/crew-expendable.html 2786
https://www.paulelliottbooks.com/dirtside.html 4682
https://www.paulelliottbooks.com/hot-zone.html 2546
https://www.paulelliottbooks.com/marine-handbook.html 4700
https://www.paulelliottbooks.com/pioneer-class-station.html 4394
https://www.paulelliottbooks.com/roughnecks.html 4406
https://www.paulelliottbooks.com/technical-manual.html 2933
https://www.paulelliottbooks.com/tool-kits.html 2180
https://www.paulelliottbooks.com/zaibatsu.html 8555
https://www.paulelliottbooks.com/hostile-resources.html 3768
https://www.paulelliottbooks.com/low-tech-supplements.html 7142
https://www.paulelliottbooks.com/modern-war.html 3206
https://www.paulelliottbooks.com/orbital.html 8991
https://www.paulelliottbooks.com/far-horizon.html 7113
https://www.paulelliottbooks.com/outpost-mars.html 4513
https://www.paulelliottbooks.com/horizon-survey-craft.html 4778
https://www.paulelliottbooks.com/planetary-tool-kits.html 7581
https://www.paulelliottbooks.com/solo.html 8451
https://www.paulelliottbooks.com/traveller-freebies.html 16155
https://www.paulelliottbooks.com/universal-world-profile.html 8213
https://www.paulelliottbooks.com/zenobia-rpg.html 7760
https://www.paulelliottbooks.com/history-books.html 13427
https://www.paulelliottbooks.com/gallery.html 971
https://www.paulelliottbooks.com/contact.html 914
https://www.paulelliottbooks.com# 556
Traceback (most recent call last):
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connection.py", line 157, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.7_3.7.1776.0_x64__qbz5n2kfra8p0\lib\socket.py", line 752, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connectionpool.py", line 672, in urlopen
chunked=chunked,
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connectionpool.py", line 994, in _validate_conn
conn.connect()
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connection.py", line 300, in connect
conn = self._new_conn()
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connection.py", line 169, in _new_conn
self, "Failed to establish a new connection: %s" % e
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x00000214FE5B7708>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\requests\adapters.py", line 449, in send
timeout=timeout
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\urllib3\util\retry.py", line 436, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.paulelliottbooks.com_blank', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000214FE5B7708>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/golan/PycharmProjects/crawl_counter/crawl_counter.py", line 21, in <module>
r = requests.get(specific_url)
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "C:\Users\golan\PycharmProjects\crawl_counter\venv\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.paulelliottbooks.com_blank', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000214FE5B7708>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Process finished with exit code 1
I think I fixed it myself!
First of all, I made sure the script ignores NULL and _blank URLs, which were causing the error messages.
Then, I did some more research and greatly simplified my word counter, which now seems to do its job accurately.
Any further suggestions about improving my script?
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://www.paulelliottbooks.com/")
bsObj = BeautifulSoup(html.read(), features="html.parser")

urls = []
for link in bsObj.find_all('a'):
    if link.get('href') not in urls:
        urls.append(link.get('href'))
    else:
        pass
print(urls)

words = 0
for url in urls:
    if url not in ["NULL", "_blank"]:
        specific_url = "https://www.paulelliottbooks.com/" + url
        r = requests.get(specific_url)
        soup = BeautifulSoup(r.text, features="html.parser")
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        text_list = text.split()
        print(f"{specific_url}: {len(text_list)} words")
        words += len(text_list)
    else:
        pass
print(words)
Output:
['/zozergames.html', '/43-ad.html', '/warband.html', '/camp-cretaceous.html', '/free-rpgs.html', '/grunt.html', '/hostile.html', '/alien-breeds.html', '/crew-expendable.html', '/dirtside.html', '/hot-zone.html', '/marine-handbook.html', '/pioneer-class-station.html', '/roughnecks.html', '/technical-manual.html', '/tool-kits.html', '/zaibatsu.html', '/hostile-resources.html', '/low-tech-supplements.html', '/modern-war.html', '/orbital.html', '/far-horizon.html', '/outpost-mars.html', '/horizon-survey-craft.html', '/planetary-tool-kits.html', '/solo.html', '/traveller-freebies.html', '/universal-world-profile.html', '/zenobia-rpg.html', '/history-books.html', '/gallery.html', '/contact.html', '#', '_blank']
https://www.paulelliottbooks.com/zozergames.html: 1148 words
https://www.paulelliottbooks.com/43-ad.html: 933 words
https://www.paulelliottbooks.com/warband.html: 610 words
https://www.paulelliottbooks.com/camp-cretaceous.html: 328 words
https://www.paulelliottbooks.com/free-rpgs.html: 535 words
https://www.paulelliottbooks.com/grunt.html: 811 words
https://www.paulelliottbooks.com/hostile.html: 726 words
https://www.paulelliottbooks.com/alien-breeds.html: 491 words
https://www.paulelliottbooks.com/crew-expendable.html: 311 words
https://www.paulelliottbooks.com/dirtside.html: 468 words
https://www.paulelliottbooks.com/hot-zone.html: 291 words
https://www.paulelliottbooks.com/marine-handbook.html: 470 words
https://www.paulelliottbooks.com/pioneer-class-station.html: 446 words
https://www.paulelliottbooks.com/roughnecks.html: 445 words
https://www.paulelliottbooks.com/technical-manual.html: 324 words
https://www.paulelliottbooks.com/tool-kits.html: 260 words
https://www.paulelliottbooks.com/zaibatsu.html: 792 words
https://www.paulelliottbooks.com/hostile-resources.html: 408 words
https://www.paulelliottbooks.com/low-tech-supplements.html: 678 words
https://www.paulelliottbooks.com/modern-war.html: 346 words
https://www.paulelliottbooks.com/orbital.html: 943 words
https://www.paulelliottbooks.com/far-horizon.html: 716 words
https://www.paulelliottbooks.com/outpost-mars.html: 518 words
https://www.paulelliottbooks.com/horizon-survey-craft.html: 497 words
https://www.paulelliottbooks.com/planetary-tool-kits.html: 831 words
https://www.paulelliottbooks.com/solo.html: 784 words
https://www.paulelliottbooks.com/traveller-freebies.html: 1490 words
https://www.paulelliottbooks.com/universal-world-profile.html: 826 words
https://www.paulelliottbooks.com/zenobia-rpg.html: 726 words
https://www.paulelliottbooks.com/history-books.html: 1207 words
https://www.paulelliottbooks.com/gallery.html: 161 words
https://www.paulelliottbooks.com/contact.html: 157 words
https://www.paulelliottbooks.com#: 127 words
19804
Process finished with exit code 0
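One further suggestion, offered only as a sketch (it filters the hrefs shown in the output above; the live site may differ): build the sub-page URLs with urljoin and keep only internal links, so placeholder hrefs such as '#' or '_blank' never turn into a bogus hostname:

from urllib.parse import urljoin, urlparse

BASE = "https://www.paulelliottbooks.com/"

def internal_pages(hrefs):
    # Resolve each href against the base URL and keep only unique links
    # that stay on the same host; skip anchors, missing hrefs and junk
    # values such as "_blank".
    seen = set()
    for href in hrefs:
        if not href or href.startswith("#") or href == "_blank":
            continue
        full = urljoin(BASE, href)
        if urlparse(full).netloc == urlparse(BASE).netloc and full not in seen:
            seen.add(full)
            yield full

You could then loop with: for specific_url in internal_pages(link.get('href') for link in bsObj.find_all('a')):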

Python - Beautiful Soup Returning Errors

I want to extract the covers of the different journals on the Cambridge University Press website and save each one under its online ISSN. The following code works, but after one or two journals it gives me this error:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection.py", line 141, in _new_conn
(self.host, self.port), self.timeout, **extra_kw)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 745, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11004] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 357, in _make_request
conn.request(method, url, **httplib_request_kw)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 964, in send
self.connect()
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection.py", line 166, in connect
conn = self._new_conn()
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection.py", line 150, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 440, in send
timeout=timeout
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\retry.py", line 388, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='ore', port=80): Max retries exceeded with url: /services/aop-file-manager/file/57f386d3efeebb2f18eac486 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\Documents\Python\python_work\Kudos\CUPgetcovers.py", line 19, in <module>
f.write(requests.get("http://" + imagefound).content)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 508, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='ore', port=80): Max retries exceeded with url: /services/aop-file-manager/file/57f386d3efeebb2f18eac486 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
Process returned 1 (0x1) execution time : 4.373 s
Press any key to continue . . .
What am I doing wrong? I could not find any answers on Google, and it was working fine before.
Thank you in advance.
Edit:
launch.py:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import csv
import requests
from time import sleep

with open('listoflinks.csv', encoding="utf8") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        http = httplib2.Http()
        status, response = http.request(("https://www.cambridge.org" + row[0]))
        soup = BeautifulSoup(response, "html.parser")
        txt = (t.text for t in soup.find_all("span", class_="value"))
        issn = next(t[:9] for t in txt if t.endswith("(Online)"))
        for a in soup.find_all('a', attrs={'class': 'image'}):
            if a.img:
                imagefound = (a.img['src'])
                imagefound = imagefound[2:]
                f = open((issn + ".jpg"), 'wb')
                f.write(requests.get("http://" + imagefound).content)
                f.close()
listoflinks.csv:
/core/journals/journal-of-materials-research
/core/journals/journal-of-mechanics
/core/journals/journal-of-modern-african-studies
/core/journals/journal-of-navigation
/core/journals/journal-of-nutritional-science
/core/journals/journal-of-pacific-rim-psychology
/core/journals/journal-of-paleontology
/core/journals/journal-of-pension-economics-and-finance
/core/journals/journal-of-plasma-physics
/core/journals/journal-of-policy-history
/core/journals/journal-of-psychologists-and-counsellors-in-schools
/core/journals/journal-of-public-policy
/core/journals/journal-of-race-ethnicity-and-politics
/core/journals/journal-of-radiotherapy-in-practice
/core/journals/journal-of-relationships-research
/core/journals/journal-of-roman-archaeology
/core/journals/journal-of-roman-studies
/core/journals/journal-of-smoking-cessation
/core/journals/journal-of-social-policy
/core/journals/journal-of-southeast-asian-studies
/core/journals/journal-of-symbolic-logic
/core/journals/journal-of-the-american-philosophical-association
/core/journals/journal-of-the-australian-mathematical-society
/core/journals/journal-of-the-gilded-age-and-progressive-era
/core/journals/journal-of-the-history-of-economic-thought
/core/journals/journal-of-the-institute-of-mathematics-of-jussieu
/core/journals/journal-of-the-international-neuropsychological-society
/core/journals/journal-of-the-international-phonetic-association
/core/journals/journal-of-the-marine-biological-association-of-the-united-kingdom
/core/journals/journal-of-the-royal-asiatic-society
/core/journals/journal-of-the-society-for-american-music
/core/journals/journal-of-tropical-ecology
/core/journals/journal-of-tropical-psychology
/core/journals/journal-of-wine-economics
/core/journals/kantian-review
/core/journals/knowledge-engineering-review
/core/journals/language-and-cognition
/core/journals/language-in-society
/core/journals/language-teaching
/core/journals/language-variation-and-change
/core/journals/laser-and-particle-beams
/core/journals/latin-american-antiquity
/core/journals/latin-american-politics-and-society
/core/journals/law-and-history-review
/core/journals/legal-information-management
/core/journals/legal-studies
/core/journals/legal-theory
/core/journals/leiden-journal-of-international-law
/core/journals/libyan-studies
/core/journals/lichenologist
/core/journals/lms-journal-of-computation-and-mathematics
/core/journals/macroeconomic-dynamics
/core/journals/management-and-organization-review
/core/journals/mathematical-gazette
/core/journals/mathematical-proceedings-of-the-cambridge-philosophical-society
/core/journals/mathematical-structures-in-computer-science
/core/journals/mathematika
/core/journals/medical-history
/core/journals/medical-history-supplements
/core/journals/melanges-d-histoire-sociale
/core/journals/microscopy-and-microanalysis
/core/journals/microscopy-today
/core/journals/mineralogical-magazine
/core/journals/modern-american-history
/core/journals/modern-asian-studies
/core/journals/modern-intellectual-history
/core/journals/modern-italy
/core/journals/mrs-advances
/core/journals/mrs-bulletin
/core/journals/mrs-communications
/core/journals/mrs-energy-and-sustainability
/core/journals/mrs-online-proceedings-library-archive
/core/journals/nagoya-mathematical-journal
/core/journals/natural-language-engineering
/core/journals/netherlands-journal-of-geosciences
/core/journals/network-science
/core/journals/new-perspectives-on-turkey
/core/journals/new-surveys-in-the-classics
/core/journals/new-testament-studies
/core/journals/new-theatre-quarterly
/core/journals/nineteenth-century-music-review
/core/journals/nordic-journal-of-linguistics
/core/journals/numerical-mathematics-theory-methods-and-applications
/core/journals/nutrition-research-reviews
/core/journals/organised-sound
/core/journals/oryx
/core/journals/paleobiology
/core/journals/the-paleontological-society-papers
/core/journals/palliative-and-supportive-care
/core/journals/papers-of-the-british-school-at-rome
/core/journals/parasitology
/core/journals/parasitology-open
/core/journals/personality-neuroscience
/core/journals/perspectives-on-politics
/core/journals/philosophy
/core/journals/phonology
/core/journals/plainsong-and-medieval-music
/core/journals/plant-genetic-resources
/core/journals/polar-record
/core/journals/political-analysis
/core/journals/political-science-research-and-methods
/core/journals/politics-and-gender
/core/journals/politics-and-religion
/core/journals/politics-and-the-life-sciences
/core/journals/popular-music
/core/journals/powder-diffraction
/core/journals/prehospital-and-disaster-medicine
/core/journals/primary-health-care-research-and-development
/core/journals/probability-in-the-engineering-and-informational-sciences
/core/journals/proceedings-of-the-asil-annual-meeting
/core/journals/proceedings-of-the-edinburgh-mathematical-society
/core/journals/proceedings-of-the-international-astronomical-union
/core/journals/proceedings-of-the-nutrition-society
/core/journals/proceedings-of-the-prehistoric-society
/core/journals/proceedings-of-the-royal-society-of-edinburgh-section-a-mathematics
/core/journals/ps-political-science-and-politics
/core/journals/psychological-medicine
/core/journals/public-health-nutrition
/core/journals/publications-of-the-astronomical-society-of-australia
/core/journals/quarterly-reviews-of-biophysics
/core/journals/quaternary-research
/core/journals/queensland-review
/core/journals/radiocarbon
/core/journals/ramus
/core/journals/recall
/core/journals/religious-studies
/core/journals/renewable-agriculture-and-food-systems
/core/journals/review-of-international-studies
/core/journals/review-of-middle-east-studies
/core/journals/review-of-politics
/core/journals/review-of-symbolic-logic
/core/journals/revista-de-historia-economica-journal-of-iberian-and-latin-american-economic-history
/core/journals/robotica
/core/journals/royal-historical-society-camden-fifth-series
/core/journals/royal-institute-of-philosophy-supplements
/core/journals/rural-history
/core/journals/science-in-context
/core/journals/scottish-journal-of-theology
/core/journals/seed-science-research
/core/journals/slavic-review
/core/journals/social-philosophy-and-policy
/core/journals/social-policy-and-society
/core/journals/social-science-history
/core/journals/spanish-journal-of-psychology
/core/journals/studies-in-american-political-development
/core/journals/studies-in-church-history
/core/journals/studies-in-second-language-acquisition
/core/journals/tempo
/core/journals/theatre-research-international
/core/journals/theatre-survey
/core/journals/theory-and-practice-of-logic-programming
/core/journals/think
/core/journals/traditio
/core/journals/trans-trans-regional-and-national-studies-of-southeast-asia
/core/journals/transactions-of-the-royal-historical-society
/core/journals/transnational-environmental-law
/core/journals/twentieth-century-music
/core/journals/twin-research-and-human-genetics
/core/journals/urban-history
/core/journals/utilitas
/core/journals/victorian-literature-and-culture
/core/journals/visual-neuroscience
/core/journals/weed-science
/core/journals/weed-technology
/core/journals/wireless-power-transfer
/core/journals/world-politics
/core/journals/world-s-poultry-science-journal
/core/journals/world-trade-review
/core/journals/zygote
You should simplify your code and your scraping strategy, although I can see that not all journal pages have the same structure. On most pages you can get the ISSN easily through a form value; on others (free-access journals, I think) you need some kind of heuristic to find it. I also don't know why you are using both httplib2 and requests, as they provide more or less the same functionality. Anyway, here's some code that does what you want ... kind of (I have also removed the CSV code, because as it stands there's no need for it):
import requests
from bs4 import BeautifulSoup, SoupStrainer

with open('listoflinks.csv', encoding="utf8") as f:
    for line in f:
        path = line.strip()
        print("getting", path)
        response = requests.get("https://www.cambridge.org" + path)
        soup = BeautifulSoup(response.text, "html.parser")
        try:
            issn = soup.find("input", attrs={'name': 'productIssn'}).get('value')
        except:
            values = soup.find_all("span", class_="value")
            for v in values:
                if "(Online)" in v.string:
                    issn = v.string.split(" ")[0]
                    break
        print("issn:", issn)
        details_container = soup.find("div", class_="details-container")
        image = details_container.find("img")
        imgurl = image['src'][2:]
        print("imgurl:", imgurl)
        with open(issn + ".jpg", 'wb') as output:
            output.write(requests.get("http://" + imgurl).content)
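One detail worth hardening, based on the assumption that the host 'ore' in the original traceback came from a src attribute that was site-relative rather than protocol-relative: stripping the first two characters and prefixing "http://" only works for "//host/..." values. urljoin resolves absolute, protocol-relative and relative src values alike (path, image and issn are the variables from the loop above):

from urllib.parse import urljoin

page_url = "https://www.cambridge.org" + path
img_url = urljoin(page_url, image['src'])  # handles //host/..., /path/... and full URLs
with open(issn + ".jpg", 'wb') as output:
    output.write(requests.get(img_url).content)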
