python scrape data usa gov - python

I get the following error:
IOError: [Errno socket error] [Errno -2] Name or service not known
when I try to scrape data from:
http://1usagov.measuredvoice.com/2013/
Any idea what the reason may be?
Here is my code:
import urllib
import re
import time

data = urllib.urlopen('http://1usagov.measuredvoice.com/2013/').read()
# print data
print data

# datafiles name pattern - usagov_bitly_data2011-07-29-1311919454
p = re.compile('usagov_bitly_data\d{4}-\d{2}-\d{2}-\d{10}')
# print p.findall('<tr><td valign="top"><img src="/icons/unknown.gif" alt="[ ]"></td><td>usagov_bitly_data2011-07-29-1311919454</td><td align="right">29-Jul-2011 07:04 </td><td')
m = p.findall(data)
# print m

for i in range(len(m)):
    if (i % 2 == 0):
        print m[i]
# time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime(epoch))
print len(m)

for i in range(len(m)):
    if (i % 2 == 0):
        print "downloading ", m[i]
        clicks = urllib.urlopen('http://bitly.measuredvoice.com/bitly_archive/' + m[i]).read()
        file = open(m[i], "w")
        file.write(clicks)
        file.close()
print "done"
Edit: full traceback added below.
Traceback (most recent call last):
File "scrape.py", line 6, in <module>
data = urllib.urlopen('http://1usagov.measuredvoice.com/2013/').read()
File "/usr/lib/python2.7/urllib.py", line 87, in urlopen
return opener.open(url)
File "/usr/lib/python2.7/urllib.py", line 208, in open
return getattr(self, name)(url)
File "/usr/lib/python2.7/urllib.py", line 345, in open_http
h.endheaders(data)
File "/usr/lib/python2.7/httplib.py", line 975, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 835, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 797, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 778, in connect
self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 553, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno -2] Name or service not known
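The traceback ends inside getaddrinfo, so the failure happens while resolving the hostname, before any HTTP request is sent. As a first check (a minimal sketch, not part of the original script), you can confirm whether the name resolves at all from your machine; if it does not, the domain may simply no longer exist, or a proxy may be required on your network:

import socket

try:
    # Same hostname the script tries to reach.
    socket.getaddrinfo('1usagov.measuredvoice.com', 80)
    print("hostname resolves")
except socket.gaierror as err:
    print("DNS lookup failed: %s" % err)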

Related

Docker python error. urllib.error.URLError: <urlopen error [Errno 99] Cannot assign requested address>

I was trying to run a script in a Docker container and got this:
Traceback (most recent call last):
File "experiments/caffemodel2pytorch.py", line 387, in <module>
net_param = initialize(args.caffe_proto).NetParameter()
File "experiments/caffemodel2pytorch.py", line 35, in initialize
mybytes = urlopen(caffe_proto).read()
File "/usr/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/usr/lib/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/usr/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/usr/lib/python3.6/urllib/request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/usr/lib/python3.6/urllib/request.py", line 1320, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 99] Cannot assign requested address>
Part of the code is here.
def initialize(caffe_proto = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto', codegen_dir = tempfile.mkdtemp(), shadow_caffe = True):
    global caffe_pb2
    if caffe_pb2 is None:
        local_caffe_proto = os.path.join(codegen_dir, os.path.basename(caffe_proto))
        with open(local_caffe_proto, 'w') as f:
            mybytes = urlopen(caffe_proto).read()
            mystr = mybytes.decode('ascii', 'ignore')
            f.write(mystr)
            #f.write((urlopen if 'http' in caffe_proto else open)(caffe_proto).read())
        subprocess.check_call(['protoc', '--proto_path', os.path.dirname(local_caffe_proto), '--python_out', codegen_dir, local_caffe_proto])
        sys.path.insert(0, codegen_dir)
        old_pool = google.protobuf.descriptor._message.default_pool
        old_symdb = google.protobuf.symbol_database._DEFAULT
        google.protobuf.descriptor._message.default_pool = google.protobuf.descriptor_pool.DescriptorPool()
        google.protobuf.symbol_database._DEFAULT = google.protobuf.symbol_database.SymbolDatabase(pool = google.protobuf.descriptor._message.default_pool)
        import caffe_pb2 as caffe_pb2
        google.protobuf.descriptor._message.default_pool = old_pool
        google.protobuf.symbol_database._DEFAULT = old_symdb
        sys.modules[__name__ + '.proto'] = sys.modules[__name__]
        if shadow_caffe:
            sys.modules['caffe'] = sys.modules[__name__]
            sys.modules['caffe.proto'] = sys.modules[__name__]
    return caffe_pb2
I think it has something to do with urlopen, but I don't know how to fix this in a Docker container. Any help would be appreciated.
BTW, I start the container like this:
sudo nvidia-docker run -itv /home/ljh/mobilepose:/home/mobilepose -p 7777:8888 ufoym/deepo:all-py36-jupyter /bin/bash
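Errno 99 ("Cannot assign requested address") often points to a networking problem inside the container, for example DNS resolving to an address the container cannot reach, or a required proxy not being configured. A quick way to narrow it down (a rough sketch to run inside the container, not part of caffemodel2pytorch) is to try fetching the same URL the script needs:

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

url = 'https://raw.githubusercontent.com/BVLC/caffe/master/src/caffe/proto/caffe.proto'
try:
    urlopen(url, timeout=10)
    print("outbound HTTPS from the container works")
except HTTPError as err:
    # We reached the server, so networking is fine; the server just answered with an error code.
    print("reached the server, but it answered with HTTP", err.code)
except URLError as err:
    print("no outbound connectivity from the container:", err.reason)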

OSError, Type22Error, and a few other ones that have me stumped

So basically this all stems from a previous question I had, so I'll post that question and my edit in their entirety below:
So I have a script I've been working with for a few days, trying to get a list of emails from a CSV I have, but now I've run into this roadblock. Here is the code:
import sys
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
import re
import csv

list1 = []
list2 = []
list3 = []

def addList():
    with open('file.csv', 'rt') as f:
        reader = csv.reader(f)
        for row in reader:
            for s in row:
                list2.append(s)

def getAddress(url):
    http = "http://"
    https = "https://"
    if http in url:
        return url
    elif https in url:
        return url
    else:
        url = "http://" + url
        return url

def parseAddress(url):
    global list3
    try:
        website = urllib2.urlopen(getAddress(url))
        html = website.read()
        addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
        global list1
        list1.append(addys)
    except urllib2.HTTPError as err:
        print ("Cannot retrieve URL: HTTP Error Code: "), err.code
        list3.append(url)
    except urllib2.URLError as err:
        print ("Cannot retrive URL: ") + err.reason[1]
        list3.append(url)

def execute():
    global list2
    addList()
    totalNum = len(list2)
    atNum = 1
    for s in list2:
        parseAddress(s)
        print ("Processing ") + str(atNum) + (" out of ") + str(totalNum)
        atNum = atNum + 1
    print ("Completed. Emails parsed: ") + str(len(list1)) + "."

### MAIN
def main():
    global list2
    execute()
    global list1
    myFile = open("finishedFile.csv", "w+")
    wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
    for s in list1:
        wr.writerow(s)
    myFile.close
    global list3
    failFile = open("failedSites.csv", "w+")
    write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
    for j in list3:
        write.writerow(j)
    failFile.close

main()
and when I run it I get this error:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 42, in parseAddress
addys = re.findall('''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*#(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''', html, flags=re.IGNORECASE)
File "/usr/lib/python3.5/re.py", line 213, in findall
return _compile(pattern, flags).findall(string)
TypeError: cannot use a string pattern on a bytes-like object
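For reference, the usual fix for that first TypeError is to decode the bytes returned by read() before matching with a str pattern (or, equivalently, compile the pattern as bytes). A minimal sketch of how the relevant lines inside parseAddress could look, with the long email regex kept in a variable (email_pattern here stands in for the original pattern):

website = urllib2.urlopen(getAddress(url))
html = website.read().decode('utf-8', errors='ignore')  # bytes -> str so a str pattern can be used
addys = re.findall(email_pattern, html, flags=re.IGNORECASE)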
So I've figured out that I need to handle the encoding, since read() returns bytes rather than a string, and Tyler's answer below helped me do so, but now I'm getting this error:
Traceback (most recent call last):
File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.5/http/client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
self.endheaders(body)
File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/usr/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/usr/lib/python3.5/http/client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/lib/python3.5/socket.py", line 712, in create_connection
raise err
File "/usr/lib/python3.5/socket.py", line 703, in create_connection
sock.connect(sa)
OSError: [Errno 22] Invalid argument
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 39, in parseAddress
website = urllib2.urlopen(getAddress(url))
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 22] Invalid argument>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 51, in parseAddress
print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'OSError' object is not subscriptable
Does this mean that one of the URLs from the list isn't a valid URL? I thought I had finally removed all of the bad URLs from my CSV file, but I may need to take another look.
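For what it's worth, the Errno 22 may indeed indicate a malformed URL, but the final TypeError is a separate issue: it comes from err.reason[1]. URLError.reason is usually an OSError (or sometimes a plain string) rather than a tuple, so indexing it raises and hides the real network error. A safer version of that except clause, as a sketch:

    except urllib2.URLError as err:
        # Print the reason without indexing it, so the real error stays visible.
        print("Cannot retrieve URL: " + url + " - " + str(err.reason))
        list3.append(url)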

problem with code in Pycharm - CERTIFICATE_VERIFY_FAILED

I work on Windows 7. This is my code:
def Rup(x, y, w):
    odleglosc = np.dot(x, w) - y
    cost = np.sum(odleglosc**2) / (2*np.shape(x)[0])
    return odleglosc, cost

def REG(data_1, data_2, data_3, Tu, cou):
    i = 0
    while i < cou:
        dif, cost = Rup(data_1, data_2, data_3)
        grad = np.dot(data_1.transpose(), dif) / np.shape(data_1)[0]
        data_3 = data_3 - Tu * grad
        if i % 200 == 0:
            print('Computed error at iteration ' + str(i) + ': ', cost)
        i += 1
    return data_3

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder

_DANE = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(_DANE, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])

le = LabelEncoder()
iris['label'] = le.fit_transform(iris['label'])

X = np.array(iris.drop(['petal_width'], axis=1))
y = np.array(iris['petal_width'])
iris.head()

cros = 1/1000
coun = 10000

_, features = np.shape(X)
wagi = np.zeros(features)

wektor = REG(X, y, wagi, cros, coun)
print('--------------------------------------------------')
print(wektor)
print('--------------------------------------------------')

dif, cost = Rup(X, y, wektor)
print('Final error:', cost)
The error message looks as follows:
Traceback (most recent call last):
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 1319, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1230, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1276, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1225, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1004, in _send_output
self.send(msg)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 944, in send
self.connect()
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1399, in connect
self.sock = self._context.wrap_socket(self.sock,
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 1040, in _create
self.do_handshake()
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1108)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/lukasz/PycharmProjects/miw/test.py", line 26, in
iris = pd.read_csv(_DANE, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
File "C:\Users\lukasz\PycharmProjects\miw\venv\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\lukasz\PycharmProjects\miw\venv\lib\site-packages\pandas\io\parsers.py", line 439, in _read
fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
File "C:\Users\lukasz\PycharmProjects\miw\venv\lib\site-packages\pandas\io\common.py", line 196, in get_filepath_or_buffer
req = urlopen(filepath_or_buffer)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 502, in _call_chain
result = func(*args)
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 1362, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "C:\Users\lukasz\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 1322, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1108)>
Process finished with exit code 1
The code itself should be all right, because it runs correctly in an online compiler.
I don't know how to deal with this problem; please help me.
You can provide an SSL context and disable the verification.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
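For example, placed near the top of the script, before the pd.read_csv call. Note that this turns off certificate checking for every HTTPS request the process makes, so it trades security for convenience; a minimal sketch:

import ssl
import pandas as pd

# Skip certificate verification for all urllib-based HTTPS requests in this process.
ssl._create_default_https_context = ssl._create_unverified_context

_DANE = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(_DANE, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])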

Python - Beautiful Soup Returning Errors

I want to extract the covers for different journals on the Cambridge University Press website, and save each cover under the journal's online ISSN. The following code works, but after one or two journals it gives me this error:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection.py", line 141, in _new_conn
(self.host, self.port), self.timeout, **extra_kw)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\socket.py", line 745, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11004] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
chunked=chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 357, in _make_request
conn.request(method, url, **httplib_request_kw)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\http\client.py", line 964, in send
self.connect()
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection.py", line 166, in connect
conn = self._new_conn()
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connection.py", line 150, in _new_conn
self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 440, in send
timeout=timeout
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\urllib3\util\retry.py", line 388, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='ore', port=80): Max retries exceeded with url: /services/aop-file-manager/file/57f386d3efeebb2f18eac486 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Boys\Documents\Python\python_work\Kudos\CUPgetcovers.py", line 19, in <module>
f.write(requests.get("http://" + imagefound).content)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\sessions.py", line 618, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Boys\AppData\Local\Programs\Python\Python36-32\lib\site-packages\requests\adapters.py", line 508, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='ore', port=80): Max retries exceeded with url: /services/aop-file-manager/file/57f386d3efeebb2f18eac486 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x030DB770>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
Process returned 1 (0x1)    execution time : 4.373 s
Press any key to continue . . .
What am I doing wrong? I could not find any answers on Google, and it was working fine before.
Thank you in advance.
Edit:
launch.py:
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import csv
import requests
from time import sleep

with open('listoflinks.csv', encoding="utf8") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        http = httplib2.Http()
        status, response = http.request(("https://www.cambridge.org" + row[0]))
        soup = BeautifulSoup(response, "html.parser")
        txt = (t.text for t in soup.find_all("span", class_="value"))
        issn = next(t[:9] for t in txt if t.endswith("(Online)"))
        for a in soup.find_all('a', attrs={'class' : 'image'}):
            if a.img:
                imagefound = (a.img['src'])
                imagefound = imagefound[2:]
                f = open((issn + ".jpg"), 'wb')
                f.write(requests.get("http://" + imagefound).content)
                f.close()
listoflinks.csv:
/core/journals/journal-of-materials-research
/core/journals/journal-of-mechanics
/core/journals/journal-of-modern-african-studies
/core/journals/journal-of-navigation
/core/journals/journal-of-nutritional-science
/core/journals/journal-of-pacific-rim-psychology
/core/journals/journal-of-paleontology
/core/journals/journal-of-pension-economics-and-finance
/core/journals/journal-of-plasma-physics
/core/journals/journal-of-policy-history
/core/journals/journal-of-psychologists-and-counsellors-in-schools
/core/journals/journal-of-public-policy
/core/journals/journal-of-race-ethnicity-and-politics
/core/journals/journal-of-radiotherapy-in-practice
/core/journals/journal-of-relationships-research
/core/journals/journal-of-roman-archaeology
/core/journals/journal-of-roman-studies
/core/journals/journal-of-smoking-cessation
/core/journals/journal-of-social-policy
/core/journals/journal-of-southeast-asian-studies
/core/journals/journal-of-symbolic-logic
/core/journals/journal-of-the-american-philosophical-association
/core/journals/journal-of-the-australian-mathematical-society
/core/journals/journal-of-the-gilded-age-and-progressive-era
/core/journals/journal-of-the-history-of-economic-thought
/core/journals/journal-of-the-institute-of-mathematics-of-jussieu
/core/journals/journal-of-the-international-neuropsychological-society
/core/journals/journal-of-the-international-phonetic-association
/core/journals/journal-of-the-marine-biological-association-of-the-united-kingdom
/core/journals/journal-of-the-royal-asiatic-society
/core/journals/journal-of-the-society-for-american-music
/core/journals/journal-of-tropical-ecology
/core/journals/journal-of-tropical-psychology
/core/journals/journal-of-wine-economics
/core/journals/kantian-review
/core/journals/knowledge-engineering-review
/core/journals/language-and-cognition
/core/journals/language-in-society
/core/journals/language-teaching
/core/journals/language-variation-and-change
/core/journals/laser-and-particle-beams
/core/journals/latin-american-antiquity
/core/journals/latin-american-politics-and-society
/core/journals/law-and-history-review
/core/journals/legal-information-management
/core/journals/legal-studies
/core/journals/legal-theory
/core/journals/leiden-journal-of-international-law
/core/journals/libyan-studies
/core/journals/lichenologist
/core/journals/lms-journal-of-computation-and-mathematics
/core/journals/macroeconomic-dynamics
/core/journals/management-and-organization-review
/core/journals/mathematical-gazette
/core/journals/mathematical-proceedings-of-the-cambridge-philosophical-society
/core/journals/mathematical-structures-in-computer-science
/core/journals/mathematika
/core/journals/medical-history
/core/journals/medical-history-supplements
/core/journals/melanges-d-histoire-sociale
/core/journals/microscopy-and-microanalysis
/core/journals/microscopy-today
/core/journals/mineralogical-magazine
/core/journals/modern-american-history
/core/journals/modern-asian-studies
/core/journals/modern-intellectual-history
/core/journals/modern-italy
/core/journals/mrs-advances
/core/journals/mrs-bulletin
/core/journals/mrs-communications
/core/journals/mrs-energy-and-sustainability
/core/journals/mrs-online-proceedings-library-archive
/core/journals/nagoya-mathematical-journal
/core/journals/natural-language-engineering
/core/journals/netherlands-journal-of-geosciences
/core/journals/network-science
/core/journals/new-perspectives-on-turkey
/core/journals/new-surveys-in-the-classics
/core/journals/new-testament-studies
/core/journals/new-theatre-quarterly
/core/journals/nineteenth-century-music-review
/core/journals/nordic-journal-of-linguistics
/core/journals/numerical-mathematics-theory-methods-and-applications
/core/journals/nutrition-research-reviews
/core/journals/organised-sound
/core/journals/oryx
/core/journals/paleobiology
/core/journals/the-paleontological-society-papers
/core/journals/palliative-and-supportive-care
/core/journals/papers-of-the-british-school-at-rome
/core/journals/parasitology
/core/journals/parasitology-open
/core/journals/personality-neuroscience
/core/journals/perspectives-on-politics
/core/journals/philosophy
/core/journals/phonology
/core/journals/plainsong-and-medieval-music
/core/journals/plant-genetic-resources
/core/journals/polar-record
/core/journals/political-analysis
/core/journals/political-science-research-and-methods
/core/journals/politics-and-gender
/core/journals/politics-and-religion
/core/journals/politics-and-the-life-sciences
/core/journals/popular-music
/core/journals/powder-diffraction
/core/journals/prehospital-and-disaster-medicine
/core/journals/primary-health-care-research-and-development
/core/journals/probability-in-the-engineering-and-informational-sciences
/core/journals/proceedings-of-the-asil-annual-meeting
/core/journals/proceedings-of-the-edinburgh-mathematical-society
/core/journals/proceedings-of-the-international-astronomical-union
/core/journals/proceedings-of-the-nutrition-society
/core/journals/proceedings-of-the-prehistoric-society
/core/journals/proceedings-of-the-royal-society-of-edinburgh-section-a-mathematics
/core/journals/ps-political-science-and-politics
/core/journals/psychological-medicine
/core/journals/public-health-nutrition
/core/journals/publications-of-the-astronomical-society-of-australia
/core/journals/quarterly-reviews-of-biophysics
/core/journals/quaternary-research
/core/journals/queensland-review
/core/journals/radiocarbon
/core/journals/ramus
/core/journals/recall
/core/journals/religious-studies
/core/journals/renewable-agriculture-and-food-systems
/core/journals/review-of-international-studies
/core/journals/review-of-middle-east-studies
/core/journals/review-of-politics
/core/journals/review-of-symbolic-logic
/core/journals/revista-de-historia-economica-journal-of-iberian-and-latin-american-economic-history
/core/journals/robotica
/core/journals/royal-historical-society-camden-fifth-series
/core/journals/royal-institute-of-philosophy-supplements
/core/journals/rural-history
/core/journals/science-in-context
/core/journals/scottish-journal-of-theology
/core/journals/seed-science-research
/core/journals/slavic-review
/core/journals/social-philosophy-and-policy
/core/journals/social-policy-and-society
/core/journals/social-science-history
/core/journals/spanish-journal-of-psychology
/core/journals/studies-in-american-political-development
/core/journals/studies-in-church-history
/core/journals/studies-in-second-language-acquisition
/core/journals/tempo
/core/journals/theatre-research-international
/core/journals/theatre-survey
/core/journals/theory-and-practice-of-logic-programming
/core/journals/think
/core/journals/traditio
/core/journals/trans-trans-regional-and-national-studies-of-southeast-asia
/core/journals/transactions-of-the-royal-historical-society
/core/journals/transnational-environmental-law
/core/journals/twentieth-century-music
/core/journals/twin-research-and-human-genetics
/core/journals/urban-history
/core/journals/utilitas
/core/journals/victorian-literature-and-culture
/core/journals/visual-neuroscience
/core/journals/weed-science
/core/journals/weed-technology
/core/journals/wireless-power-transfer
/core/journals/world-politics
/core/journals/world-s-poultry-science-journal
/core/journals/world-trade-review
/core/journals/zygote
You should simplify your code and your scraping strategy, although I can see that not all journal pages have the same structure. On most pages you can get the ISSN easily through a form value; on others (free access, I think) you need to apply some kind of heuristic to find the ISSN. Also, I don't know why you are using both httplib2 and requests, as they provide more or less the same functionality. Anyway, here's some code that does what you want... kind of (I have also removed the csv module, since as it stands there's no need for it):
import requests
from bs4 import BeautifulSoup, SoupStrainer

with open('listoflinks.csv', encoding="utf8") as f:
    for line in f:
        path = line.strip()
        print("getting", path)
        response = requests.get("https://www.cambridge.org" + path)
        soup = BeautifulSoup(response.text, "html.parser")
        try:
            issn = soup.find("input", attrs={'name': 'productIssn'}).get('value')
        except:
            values = soup.find_all("span", class_="value")
            for v in values:
                if "(Online)" in v.string:
                    issn = v.string.split(" ")[0]
                    break
        print("issn:", issn)
        details_container = soup.find("div", class_="details-container")
        image = details_container.find("img")
        imgurl = image['src'][2:]
        print("imgurl:", imgurl)
        with open(issn + ".jpg", 'wb') as output:
            output.write(requests.get("http://" + imgurl).content)
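Since the original failure was an intermittent getaddrinfo/connection error, it may also be worth guarding the image download so a single bad lookup does not abort the whole run. A rough sketch (fetch_image is a hypothetical helper, not part of the answer above):

import time
import requests

def fetch_image(url, attempts=3, wait=5):
    # Retry the download a few times; give up on this journal after the last attempt.
    for attempt in range(attempts):
        try:
            return requests.get(url, timeout=30).content
        except requests.exceptions.ConnectionError as err:
            print("attempt", attempt + 1, "failed:", err)
            time.sleep(wait)
    return None

In the loop above, the direct requests.get call could then be replaced with fetch_image plus a check for None before writing the file.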

Urllib error when using BioPython

I am working on a project for which I need to download a few thousand citations from PubMed. I am currently using BioPython and have written this code:
from Bio import Entrez
from Bio import Medline
from pandas import *
from sys import argv
import os

Entrez.email = "email"

df = read_csv("/Users/.../Desktop/sr_dataset/adhd/excluded/adhdExcluded.csv")

i = 0
withoutMesh = 0
withoutMeshID = ""
withoutAbstract = 0
withoutAbstractID = ""
path = '/Users/.../Desktop/sr_dataset/adhd/excluded'

for index, row in df.iterrows():
    print (row.id)
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(row.id))
    records = Medline.parse(handle)
    for record in records:
        try:
            abstract = str(record["AB"])
        except:
            abstract = "none"
            withoutAbstract = withoutAbstract + 1
            withoutAbstractID = withoutAbstractID + str(row.id) + "\n"
        try:
            title = str(record["TI"])
        except:
            title = "none"
        try:
            mesh = str(record["MH"])
        except:
            mesh = "none"
            withoutMesh = withoutMesh + 1
            withoutMeshID = withoutMeshID + str(row.id) + "\n"
    filename = str(row.id) + '.txt'
    filename = os.path.join(path, filename)
    file = open(filename, "w")
    output = "title: " + str(title) + "\n\n" + "abstract: " + str(abstract) + "\n\n" + "mesh: " + str(mesh) + "\n\n"
    file.write(output)
    file.close()
    print (i)
    i = i + 1

filename = os.path.join(path, "overview.txt")
file = open(filename, "w")
output = "Without MeSH terms:" + str(withoutMesh) + "\n" + "ID's: " + str(withoutMeshID) + "\n\n" + "Without abstract: " + str(withoutAbstract) + "\n" + "ID's: " + str(withoutAbstractID)
file.write(output)
file.close()
The code works for the first few hundred rows in the table but then stops executing and the error I receive is:
Traceback (most recent call last):
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1106, in request
self._send_request(method, url, body, headers)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1151, in _send_request
self.endheaders(body)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1102, in endheaders
self._send_output(message_body)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/Users/.../anaconda/lib/python3.5/http/client.py", line 1260, in connect
server_hostname=server_hostname)
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 377, in wrap_socket
_context=self)
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 752, in __init__
self.do_handshake()
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 988, in do_handshake
self._sslobj.do_handshake()
File "/Users/.../anaconda/lib/python3.5/ssl.py", line 633, in do_handshake
self._sslobj.do_handshake()
ConnectionResetError: [Errno 54] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/.../Desktop/sr_dataset/ace_inhibitor/excluded/pumbedMedline.py", line 18, in <module>
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=str(row.id))
File "/Users/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 180, in efetch
return _open(cgi, variables, post=post)
File "/Users/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 524, in _open
handle = _urlopen(cgi)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 1297, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 54] Connection reset by peer>
Here are the first few rows of the CSV file:
id
10029645
10073846
10078088
10080457
10088066
...
Biopython does follow the "up to three queries per second" rule to avoid abusing the NCBI servers, but you may have missed the first bullet point on the guidelines in our tutorial http://biopython.org/DIST/docs/tutorial/Tutorial.html :
"For any series of more than 100 requests, do this at weekends or outside USA peak times. This is up to you to obey."
That said, sometimes you will get intermittent errors from Entrez, and using a try/except block to handle this with a retry is suggested. There is an example in the tutorial.
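The tutorial's retry example is not reproduced here, but a minimal sketch of the same idea (wrapping the efetch call and retrying on transient network errors; the helper name and wait times are my own) might look like:

import time
from urllib.error import HTTPError, URLError
from Bio import Entrez

def efetch_with_retries(pmid, attempts=3, wait=15):
    # Retry transient network failures a few times before giving up on this ID.
    for attempt in range(attempts):
        try:
            return Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(pmid))
        except (HTTPError, URLError, ConnectionResetError) as err:
            if attempt == attempts - 1:
                raise
            print("attempt %d failed (%s); retrying in %d seconds" % (attempt + 1, err, wait))
            time.sleep(wait)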
