I am trying to scrape content from a website, but I am getting the error shown below.
The method:
def scrape_newtimes():
    """Scrapes content from the NewTimes"""
    url = 'https://www.newtimes.co.rw/'
    r = requests.get(url, headers=HEADERS)
    tree = fromstring(r.content)
    links = tree.xpath('//div[@class="x-small-push clearfix"]/a/@href')
    for link in links:
        r = requests.get(link, headers=HEADERS)
        blog_tree = fromstring(r.content)
        paras = blog_tree.xpath('//div[@class="article-content"]/p')
        para = extract_paratext(paras)
        text = extract_text(para)
        if not text:
            continue
        yield '"%s" %s' % (text, link)
The error I am getting:
>>> sc = scrape_newtimes()
>>> string_1 = next(sc)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "D:\Projects\bird\bird-env\bot.py", line 58, in scrape_newtimes
r = requests.get(link, headers=HEADERS)
File "D:\Projects\bird\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "D:\Projects\bird\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Projects\bird\venv\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "D:\Projects\bird\venv\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "D:\Projects\bird\venv\lib\site-packages\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "D:\Projects\bird\venv\lib\site-packages\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda': No schema supplied. Perhaps you meant http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
>>>
The exception basically tells you what is wrong:
requests.exceptions.MissingSchema: Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda': No schema supplied. Perhaps you meant http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
Or, with line wrapping:
Invalid URL '/news/londons-kings-college-launch-civil-service-programme-rwanda':
No schema supplied. Perhaps you meant
http:///news/londons-kings-college-launch-civil-service-programme-rwanda?
Your link does not contain a complete URL: the href values you scrape are relative paths, so requests cannot tell which scheme and host to use.
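One way to fix this is to join each scraped href with the site's base URL before requesting it, for example with urllib.parse.urljoin. A minimal sketch, keeping the structure of your function (HEADERS, fromstring, extract_paratext, and extract_text are assumed to be defined or imported elsewhere in your module, as in the original code):
from urllib.parse import urljoin

import requests

def scrape_newtimes():
    """Scrapes content from the NewTimes."""
    url = 'https://www.newtimes.co.rw/'
    r = requests.get(url, headers=HEADERS)
    tree = fromstring(r.content)
    links = tree.xpath('//div[@class="x-small-push clearfix"]/a/@href')
    for link in links:
        # '/news/...' becomes 'https://www.newtimes.co.rw/news/...'
        full_link = urljoin(url, link)
        r = requests.get(full_link, headers=HEADERS)
        blog_tree = fromstring(r.content)
        paras = blog_tree.xpath('//div[@class="article-content"]/p')
        para = extract_paratext(paras)
        text = extract_text(para)
        if not text:
            continue
        yield '"%s" %s' % (text, full_link)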
Related
I am trying to use Reddit's developer API to build a simple scraper that grabs posts and their replies in a target subreddit and produces JSON with the information.
I am getting a 404 error that I don't understand.
This is my code:
import praw
import json

def scrape(subreddit, limit):
    r = praw.Reddit(user_agent='Reddit data organizer 1.0 by /u/reallymemorable',
                    client_id='none of your business',
                    client_secret='none of your business')
    submissions = r.subreddit(subreddit).get_hot(limit=limit)
    for submission in submissions:
        data = {}
        data['title'] = submission.title
        data['score'] = submission.score
        data['url'] = submission.url
        data['author'] = str(submission.author)
        data['subreddit'] = str(submission.subreddit)
        data['num_comments'] = submission.num_comments
        data['over_18'] = submission.over_18
        data['selftext'] = submission.selftext
        data['is_self'] = submission.is_self
        data['name'] = submission.name
        data['created_utc'] = submission.created_utc
        data['permalink'] = submission.permalink
        data['domain'] = submission.domain
        data['id'] = submission.id
        data['kind'] = submission.kind
        json.dumps(data)

scrape('https://www.reddit.com/r/funny/', 25)
When I run it, I get this:
reallymemorable@Christians-MBP Desktop % python3 fetch-data-subreddit.py
Traceback (most recent call last):
File "/Users/reallymemorable/Desktop/fetch-data-subreddit.py", line 26, in <module>
scrape('https://www.reddit.com/r/augmentedreality/comments/yv7sn8/ar_maximum_distance/', 25)
File "/Users/reallymemorable/Desktop/fetch-data-subreddit.py", line 6, in scrape
submissions = r.subreddit(subreddit).get_hot(limit=limit)
File "/opt/homebrew/lib/python3.9/site-packages/praw/models/reddit/base.py", line 34, in __getattr__
self._fetch()
File "/opt/homebrew/lib/python3.9/site-packages/praw/models/reddit/subreddit.py", line 583, in _fetch
data = self._fetch_data()
File "/opt/homebrew/lib/python3.9/site-packages/praw/models/reddit/subreddit.py", line 580, in _fetch_data
return self._reddit.request(method="GET", params=params, path=path)
File "/opt/homebrew/lib/python3.9/site-packages/praw/util/deprecate_args.py", line 43, in wrapped
return func(**dict(zip(_old_args, args)), **kwargs)
File "/opt/homebrew/lib/python3.9/site-packages/praw/reddit.py", line 941, in request
return self._core.request(
File "/opt/homebrew/lib/python3.9/site-packages/prawcore/sessions.py", line 330, in request
return self._request_with_retries(
File "/opt/homebrew/lib/python3.9/site-packages/prawcore/sessions.py", line 266, in _request_with_retries
raise self.STATUS_EXCEPTIONS[response.status_code](response)
prawcore.exceptions.NotFound: received 404 HTTP response
In r.subreddit(subreddit), subreddit should just be the name of the subreddit, e.g. "funny", not the full URL.
See the docs here: https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#obtain-a-subreddit
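A minimal sketch of the corrected call, assuming a current PRAW version where listings are fetched with .hot() rather than the old get_hot() (credentials stay placeholders, and only a few submission fields are shown):
import json
import praw

def scrape(subreddit_name, limit):
    reddit = praw.Reddit(user_agent='Reddit data organizer 1.0 by /u/reallymemorable',
                         client_id='none of your business',
                         client_secret='none of your business')
    # Pass only the subreddit's name ("funny"), not its full URL.
    for submission in reddit.subreddit(subreddit_name).hot(limit=limit):
        data = {
            'title': submission.title,
            'score': submission.score,
            'url': submission.url,
            'permalink': submission.permalink,
        }
        print(json.dumps(data))

scrape('funny', 25)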
These are the logs I received from the unittest run for my Flask API application, which uses flask-restful; they show an 'int' object is not iterable error.
I'd really appreciate it if anyone could tell me what is actually wrong in my unittest code :(
.
======================================================================
ERROR: test_brand_create (test.brand.BrandTest)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/Users/david/ITP-Team3/test/brand.py", line 37, in test_brand_create
response = tester.post('/api/brand/', data=json.dumps(payload), headers=headers)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 1140, in post
return self.open(*args, **kw)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/flask/testing.py", line 217, in open
return super().open(
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 1089, in open
response = self.run_wsgi_app(request.environ, buffered=buffered)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 956, in run_wsgi_app
rv = run_wsgi_app(self.application, environ, buffered=buffered)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 1255, in run_wsgi_app
for item in app_iter:
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/wsgi.py", line 462, in __next__
return self._next()
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/wrappers/response.py", line 50, in _iter_encoded
for item in iterable:
TypeError: 'int' object is not iterable
======================================================================
ERROR: test_brand_put (test.brand.BrandTest)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/Users/david/ITP-Team3/test/brand.py", line 49, in test_brand_put
response = tester.put('/api/brand/'+id, data=json.dumps(payload), headers=headers)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 1145, in put
return self.open(*args, **kw)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/flask/testing.py", line 217, in open
return super().open(
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 1089, in open
response = self.run_wsgi_app(request.environ, buffered=buffered)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 956, in run_wsgi_app
rv = run_wsgi_app(self.application, environ, buffered=buffered)
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/test.py", line 1255, in run_wsgi_app
for item in app_iter:
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/wsgi.py", line 462, in __next__
return self._next()
File "/Users/david/.local/share/virtualenvs/ITP-Team3-bpvJtb_C/lib/python3.10/site-packages/werkzeug/wrappers/response.py", line 50, in _iter_encoded
for item in iterable:
TypeError: 'int' object is not iterable
----------------------------------------------------------------------
Ran 11 tests in 2.370s
FAILED (errors=2)
Here's my test code that I had the error on:
# brand.py
def test_brand_create(self):
    tester = app.test_client(self)
    headers = login(tester)
    payload = {'brandName': 'test1'}
    response = tester.post('/api/brand/', data=json.dumps(payload), headers=headers)
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.content_type, 'application/json')

def test_brand_put(self):
    tester = app.test_client(self)
    headers = login(tester)
    payload = {'brandName': 'test'}
    print("TYPE", type(get_id(app, Brand, 'test1')))
    print("VALUE", get_id(app, Brand, 'test1'))
    id = get_id(app, Brand, 'test1')
    response = tester.put('/api/brand/' + id, data=json.dumps(payload), headers=headers)
    self.assertEqual(response.status_code, 200)
    self.assertEqual(response.content_type, 'application/json')
And the utils code as well:
# utils.py
def login(tester):
    headers = {'Content-Type': 'application/json'}
    # put user and password in .env file
    payload = {'username': os.getenv("user"), 'password_hash': os.getenv("password")}
    response = tester.post('/api/user/login/', data=json.dumps(payload), headers=headers)
    jwt_str = response.data.decode('utf-8')
    header_jwt = ast.literal_eval(jwt_str)
    return {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + header_jwt['jwt_token']}

def get_id(app, model, brandName) -> str:
    with app.app_context():
        test_query = model.query.filter_by(brandName=brandName).first()
        id_lookup = model.query.get(test_query.id)
        return str(id_lookup.id)
I have taken links from one page (159 in total) and now want to run them through a loop and get information from each of their pages.
When I do this I am getting an error, as one of the links is coming back invalid (I think that is why it is erroring out).
Any advice/direction or help would be greatly appreciated.
Thank you.
See the code below:
import requests
from bs4 import BeautifulSoup
import pandas as pd

baseurl = "https://www.auveco.com"

productlinks = []
r = requests.get('https://www.auveco.com/products')
soup = BeautifulSoup(r.content, 'lxml')

productlist = soup.find_all('li', class_='opened')

for item in productlist:
    for link in item.find_all('a', href=True):
        productlinks.append(baseurl + link['href'])
#print(productlinks)

#part#2
partno = []
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        name = soup.find_all('li', class_='product-code').text
    except:
        name = 'nopage'
    print(name)
Here is the error I am getting:
Traceback (most recent call last):
File "C:\Program Files\Python39\lib\site-packages\requests\models.py", line 382, in prepare_url
scheme, auth, host, port, path, query, fragment = parse_url(url)
File "C:\Program Files\Python39\lib\site-packages\urllib3\util\url.py", line 392, in parse_url
return six.raise_from(LocationParseError(source_url), None)
File "<string>", line 3, in raise_from
urllib3.exceptions.LocationParseError: Failed to parse: https://www.auveco.comjavascript:void(0);
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\ppluc\PycharmProjects\pythonProject\auv22.py", line 30, in <module>
r = requests.get(link, headers=headers)
File "C:\Program Files\Python39\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Program Files\Python39\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Program Files\Python39\lib\site-packages\requests\sessions.py", line 528, in request
prep = self.prepare_request(req)
File "C:\Program Files\Python39\lib\site-packages\requests\sessions.py", line 456, in prepare_request
p.prepare(
File "C:\Program Files\Python39\lib\site-packages\requests\models.py", line 316, in prepare
self.prepare_url(url, params)
File "C:\Program Files\Python39\lib\site-packages\requests\models.py", line 384, in prepare_url
raise InvalidURL(*e.args)
requests.exceptions.InvalidURL: Failed to parse: https://www.auveco.comjavascript:void(0);
nopage
You can also wrap r = requests.get(link, headers=headers) in a try-except block, like:
try:
    r = requests.get(link, headers=headers)
except requests.exceptions.InvalidURL as e:
    print(str(e))
Then we can also skip the other parsing steps, if the get fails:
#part#2
partno = []
for link in productlinks:
    try:
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.content, 'lxml')
        # find() returns a single element; find_all() returns a list, which has no .text
        name = soup.find('li', class_='product-code').text
    except requests.exceptions.InvalidURL as e:
        print(str(e))
        name = 'url invalid'
    except:
        name = 'nopage'
    print(name)
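Alternatively, you can skip non-HTTP hrefs such as javascript:void(0); when you build productlinks in the first place, so the malformed URL never reaches requests.get. A rough sketch of that filter applied to the link-collection loop from the question:
for item in productlist:
    for link in item.find_all('a', href=True):
        href = link['href']
        # Skip javascript: handlers and fragment-only links; keep real paths.
        if href.startswith('javascript:') or href.startswith('#'):
            continue
        productlinks.append(baseurl + href)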
For example: I need to post data in a format like this: {imgFiles: [(filename, file), (filename, file), (filename, file)]}
I have tried to do it like this:
pic_array = [
    ('file1', open("somefile.xml", "r")),
    ('file2', open("somefile2.xml", "r"))
]
files_pics = [('imgFiles', pic_array)]
r = requests.post(
    'https://some.site/path/to/api/point',
    data=data_details,
    headers=headers_1,
    files=files_pics
)
print(r.status_code, r.reason, r.json())
and I get the following:
Traceback (most recent call last):
File "C:/Users/someusername/PycharmProjects/someprojectname/data_load.py", line 115, in <module>
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py", line 116, in post
return request('post', url, data=data, json=json, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\models.py", line 316, in prepare
self.prepare_body(data, files, json)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\models.py", line 504, in prepare_body
(body, content_type) = self._encode_files(files, data)
File "C:\ProgramData\Anaconda3\lib\site-packages\requests\models.py", line 151, in _encode_files
fn, fp, ft, fh = v
ValueError: not enough values to unpack (expected 4, got 1)
Is there any way to post the files exactly as an array?
You are sending multiple files incorrectly; see the following working example. If I take my example and change it to a list like yours, I get the same error.
import requests

url = 'http://localhost:8080/'
files = {'file': open('sql.py', 'rb'),
         'file2': open('lst.py', 'rb')}

r = requests.post(url, files=files)
print(r.text)
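If the endpoint really does expect several files under one imgFiles field, requests also accepts files as a list of (field_name, file_info) pairs, where file_info is a (filename, file_object) tuple. A sketch of that form, reusing the names from the question and opening the files in binary mode:
import requests

files_pics = [
    ('imgFiles', ('somefile.xml', open('somefile.xml', 'rb'))),
    ('imgFiles', ('somefile2.xml', open('somefile2.xml', 'rb'))),
]

r = requests.post(
    'https://some.site/path/to/api/point',
    data=data_details,
    headers=headers_1,
    files=files_pics
)
print(r.status_code, r.reason)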
Why am I getting an AttributeError: 'int' object has no attribute 'encode'?
I am trying to retrieve a tweet using the Twitter API in Python. Full traceback here:
Traceback (most recent call last):
File "C:/Python27/lol.py", line 34, in <module>
headers = req.to_header()
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 398, in to_header
params_header = ', '.join(header_params)
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 397, in <genexpr>
header_params = ('%s="%s"' % (k, v) for k, v in stringy_params)
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 396, in <genexpr>
stringy_params = ((k, escape(v)) for k, v in oauth_params)
File "build\bdist.win-amd64\egg\oauth2\__init__.py", line 163, in escape
s = s.encode('utf-8')
AttributeError: 'int' object has no attribute 'encode'
Below is the code I'm using.
import oauth2
import time
import urllib2
import json

url1 = "https://api.twitter.com/1.1/search/tweets.json"

params = {
    "oauth_version": "1.9.0",
    "oauth_nonce": oauth2.generate_nonce(),
    "oauth_timestamp": int(time.time())
}

consumer = oauth2.Consumer(key="*********", secret="*********")
token = oauth2.Token(key="*********", secret="*********")

params["oauth_consumer_key"] = consumer.key
params["oauth_token"] = token.key

for i in range(1):
    url = url1
    req = oauth2.Request(method="GET", url=url, parameters=params)
    signature_method = oauth2.SignatureMethod_HMAC_SHA1()
    req.sign_request(signature_method, consumer, token)
    headers = req.to_url()
    print headers
    print url

for i in range(1):
    url = url1
    params["q"] = "pictorial"
    params["count"] = 2
    req = oauth2.Request(method="GET", url=url, parameters=params)
    signature_method = oauth2.SignatureMethod_HMAC_SHA1()
    req.sign_request(signature_method, consumer, token)
    headers = req.to_header()
    url = req.to_url()
    response = urllib2.Request(url)
    data = json.load(urllib2.urlopen(response))
    if data["statuses"] == []:
        print "end of data"
        break
    else:
        print data
And if I change int(time.time()) to str(time.time()), I get the following error:
Traceback (most recent call last):
File "C:/Python27/lol.py", line 37, in <module>
data = json.load(urllib2.urlopen(response))
File "C:\Python27\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 437, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 475, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 400: Bad Request
"oauth_timestamp": int(time.time())
Here you use an int, but that field must be a string.
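For example, converting the timestamp to a string of whole seconds keeps the value OAuth expects while satisfying the escape() call that failed:
import time

# Whole seconds since the epoch, passed as a string so escape() can call .encode() on it.
params["oauth_timestamp"] = str(int(time.time()))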