404: "GET /search?q=books HTTP/1.1" 404 - - python

I am new to web scraping and building APIs, and I am facing an error while scraping an e-commerce website. Below is my Python code; please guide me through it. I am getting "The requested URL was not found on the server." when running it on localhost.
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import requests

app = Flask(__name__)

@app.route('/', methods=['GET'])
def API():
    if request.method == 'GET':
        uri = 'https://www.flipkart.com'
        query = str(request.args['query'])
        print(query)
        if " " in query:
            query = str(query).replace(" ", "+")
        else:
            pass
        search = '/search?q=' + query
        ready_uri = uri + search
        print(ready_uri)
        content = requests.get(ready_uri).content
        soup = BeautifulSoup(content, 'html.parser')
        quotes_links = soup.find_all('a', {'class': '_3O0U0u'})
        l = []
        for i in quotes_links:
            d = {}
            quote_url = uri + i.get('href')
            quote_content = requests.get(quote_url).content
            quote_soup = BeautifulSoup(quote_content, 'html.parser')
            d['quote'] = quote_soup.find('p', {'class': '_3wU53n'}).text
            d['author'] = quote_soup.find('p', {'class': '_1vC4OE _2rQ-NK'}).text
            l.append(d)
        return jsonify(l)

if __name__ == '__main__':
    app.run()
Error:
"GET /search?q=books HTTP/1.1" 404 -

How do you get a query string on Flask?
You appear to be getting the query argument incorrectly.
query = str(request.args['query'])
When it should be:
query = str(request.args.get('query'))
Doing so returns a 200, but with blank data. I would suggest looking at the element you're scraping:
quotes_links = soup.find_all('a', {'class': '_3O0U0u'})
Once you obtain the correct element with soup, you should start seeing return data.
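For reference, here is a minimal sketch of reading a query string in Flask; the /search path and the q parameter name simply mirror the request in your error log, and the jsonify payload is just an illustration:

from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/search', methods=['GET'])
def search():
    # request.args.get returns None (or the given default) instead of raising a KeyError
    query = request.args.get('q', '')
    return jsonify({'query': query})

if __name__ == '__main__':
    app.run()

With a route registered at /search, a request like GET /search?q=books on localhost is matched, and request.args.get('q') gives you the search term.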

How to extract anchor class using beautifulsoup?

Can anyone help me resolve this error? I have already spent an hour trying to solve it. quotes_links comes back empty (it should contain all the elements with the given class), but it does not.
from flask import Flask, jsonify, request
# from markupsafe import escape
from bs4 import BeautifulSoup
import requests

app = Flask(__name__)

@app.route('/api/', methods=['GET'])
def API():
    if request.method == 'GET':
        url = 'https://www.brainyquote.com/'
        query = str(request.args['query'])
        if " " in query:
            query = str(query).replace(" ", "+")
        else:
            pass
        search = '/search_results?q=' + query
        ready_url = url + search
        content = requests.get(ready_url).content
        soup = BeautifulSoup(content, 'html.parser')
        quotes_links = soup.find_all("a", class_="b-qt")
        print("hello")
        print(quotes_links)
        list = []
        for i in quotes_links:
            d = {}
            quote_url = url + i.get('href')
            quote_content = requests.get(quote_url).content
            quote_soup = BeautifulSoup(quote_content, 'html.parser')
            d['quote'] = quote_soup.find('p', class_="b-qt").text
            d['author'] = str(quote_soup.find('p', class_="bq-aut").text).strip()
            list.append(d)
        return jsonify(list)

if __name__ == "__main__":
    app.run(debug=True)
Please help me. Why am I not getting any values in the JSON? My list is empty, and quotes_links is empty as well. Is there a syntax mistake or something else?
Your ready_url variable ends up having a double slash in it (e.g. https://www.brainyquote.com//search_results?q=testing). If you test that in a browser or with curl, you'll see that it yields no results. If you fix the definition of url so it doesn't have the trailing slash (i.e. url = 'https://www.brainyquote.com'), your code will work.
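A minimal sketch of the fix, reusing the testing query from above: either drop the trailing slash and concatenate as before, or keep it and let urllib.parse.urljoin normalize the path for you.

from urllib.parse import urljoin

# Option 1: base URL without a trailing slash, plain concatenation
url = 'https://www.brainyquote.com'
ready_url = url + '/search_results?q=' + 'testing'

# Option 2: keep the trailing slash and let urljoin handle the joining
ready_url = urljoin('https://www.brainyquote.com/', 'search_results?q=testing')

print(ready_url)  # https://www.brainyquote.com/search_results?q=testing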

Problems with data retrieving using Python web scraping

I wrote a simple script for scraping data from a web page. I specified the tag and class for each element, but my program does not scrape any data. Also, there is an email address that I want to scrape, but I don't know how to reference its id or class. Could you please guide me on how to fix this issue? Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_detail_data(soup):
    try:
        title = soup.find('hi', class_="page-header", id=False).text
    except:
        title = 'empty'
    print(title)
    try:
        email = soup.find('', class_="", id=False).text
    except:
        email = 'empty'
    print(email)

def main():
    url = "https://www.igrc.org/clergydetail/2747164"
    #get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
As you noticed, the email value is not in plain text. The HTML is written into the page via JavaScript in a script tag:
<script type="text/javascript">document.write(String.fromCharCode(60,97,32,104,114,101,102,61,34,35,34,32,115,116,121,108,101,61,34,117,110,105,99,111,100,101,45,98,105,100,105,58,98,105,100,105,45,111,118,101,114,114,105,100,101,59,100,105,114,101,99,116,105,111,110,58,114,116,108,59,34,32,111,110,99,108,105,99,107,61,34,116,104,105,115,46,104,114,101,102,61,83,116,114,105,110,103,46,102,114,111,109,67,104,97,114,67,111,100,101,40,49,48,57,44,57,55,44,49,48,53,44,49,48,56,44,49,49,54,44,49,49,49,44,53,56,44,49,49,52,44,49,49,49,44,57,56,44,54,52,44,49,48,57,44,49,48,49,44,49,49,54,44,49,48,52,44,49,49,49,44,49,48,48,44,49,48,53,44,49,49,53,44,49,49,54,44,52,54,44,57,57,44,57,57,41,59,34,62,38,35,57,57,59,38,35,57,57,59,38,35,52,54,59,38,35,49,49,54,59,38,35,49,49,53,59,38,35,49,48,53,59,38,35,49,48,48,59,38,35,49,49,49,59,38,35,49,48,52,59,38,35,49,49,54,59,38,35,49,48,49,59,38,35,49,48,57,59,38,35,54,52,59,38,35,57,56,59,38,35,49,49,49,59,38,35,49,49,52,59,60,47,97,62));</script>
which contains the ASCII character codes of the markup. Decoded, the visible text of the link is:
cc.tsidohtem@bor
which would need to be decoded (reversed) too. We only need the mailto: value present in onclick, since the content of the mailto is unchanged, whereas the text of the a tag is reversed using direction: rtl (as noticed by Hugo):
mailto:john@doe.inc
The following Python code extracts the mail:
import requests
from bs4 import BeautifulSoup
import re

r = requests.get("https://www.igrc.org/clergydetail/2747164")
soup = BeautifulSoup(r.text, 'html.parser')

titleContainer = soup.find(class_="page-header")
title = titleContainer.text.strip() if titleContainer else "empty"

emailScript = titleContainer.findNext("script").text

def parse(data):
    # pull the comma-separated character codes out of String.fromCharCode(...)
    res = re.search(r'\(([\d+,]*)\)', data, re.IGNORECASE)
    return "".join([
        chr(int(i))
        for i in res.group(1).split(",")
    ])

emailData1 = parse(emailScript)   # first pass: decodes the <a> tag markup
email = parse(emailData1)         # second pass: decodes the mailto: in onclick
print(title)
print(email.split(":")[1])
One could reproduce this encoding the other way around using the following code:
def encode(data):
    return ",".join([str(ord(i)) for i in data])

mail = "john@doe.inc"
encodedMailTo = encode("mailto:" + mail)
encodedHtmlEmail = "".join(["&#" + str(ord(i)) + ";" for i in mail])

# build an <a> tag like the one on the page: the visible text is the
# HTML-entity-encoded mail, and the onclick decodes the mailto: target
htmlContainer = f'<a href="#" onclick="this.href=String.fromCharCode({encodedMailTo});">{encodedHtmlEmail}</a>'
encodedHtmlContainer = encode(htmlContainer)

scriptContainer = f'<script type="text/javascript">document.write(String.fromCharCode({encodedHtmlContainer}));</script>'
print(scriptContainer)
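As a quick sanity check (a sketch reusing the parse helper from the extraction snippet above, and assuming the rebuilt tag includes the onclick handler as shown), decoding the generated script twice recovers the mailto: link:

decoded_tag = parse(scriptContainer)   # first pass: yields the rebuilt <a> tag
decoded_mailto = parse(decoded_tag)    # second pass: yields the onclick payload
print(decoded_mailto)                  # mailto:john@doe.inc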

why am I still getting results as html using beautifulsoup?

I am writing a simple scraper for job postings, but my function extract_fulltext, which is responsible for returning the full job description, still gives me HTML tags in the response. In this case it raises InvalidSchema("No connection adapters were found for '%s'" % url); the full traceback is at https://gist.github.com/SkyBulk/c6df488ef53ae6bc62c86670cfbd09ec
def extract_fulltext(url):
    html = requests.get(url)
    job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text))
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
    ajax_content = requests.get(ajax_url)
    soup = BeautifulSoup(ajax_content.text, "lxml")
    text = soup.getText()
    return soup
response = requests.get(url, headers=self.headers)
data = response.text
soup = get_soup(data)
html = soup.find_all(name="div", attrs={"class": "row"})
for page in html:
    print(page)
    prefix = ['30', 'monaten', 'meses', 'luni', 'mois', 'month', 'months', 'maanden',
              'mesi', 'mies.', 'm\u00e5nader', '\u043c\u0435\u0441\u044f\u0446\u0435\u0432']
    date_str = extract_date(page)
    s_date = date_str.replace('+', '')
    match = [prefix_match for prefix_match in prefix if prefix_match in s_date]
    if len(match) > 0:
        pass
    elif "NOT_FOUND" in s_date:
        pass
    else:
        self.data_extracted['jobs'].append({
            'job_title': extract_job_title(page),
            'company': extract_company(page),
            'city': extract_location(page),
            'date': extract_date(page),
            'cleared': extract_fulltext(page),
            'url': [self.urls[country] + extract_link(page)]
        })
I expect output of the form {"job_id": "description"}, but the actual output is an error.
You can build your approach on this solution:
import requests, json
from bs4 import BeautifulSoup

req = requests.get('https://www.indeed.com/rpc/jobdescs?jks=80635306093cf18a,7496998d9ee18bdc')
data = json.loads(req.text)
for id in data.keys():
    soup = BeautifulSoup(data[id], 'html.parser')
    print(soup.text)
Simply use .get_text():
def extract_fulltext(url):
    html = requests.get(url)
    job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text))
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
    ajax_content = requests.get(ajax_url)
    soup = BeautifulSoup(ajax_content.text, "lxml")
    text = soup.get_text()
    return text
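If you want the {"job_id": "description"} shape you mentioned, here is a minimal sketch (assuming the jobdescs endpoint returns a JSON object keyed by job id, as in the snippet above; extract_descriptions is just an illustrative helper name):

import json
import re
import urllib.parse

import requests
from bs4 import BeautifulSoup

def extract_descriptions(url):
    # collect the job keys from the search results page
    html = requests.get(url)
    job_ids = re.findall(r"jobKeysWithInfo\['(.+?)'\]", html.text)
    ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(','.join(job_ids))
    # the endpoint returns {job_id: "<html description>"}; strip the tags with get_text()
    data = json.loads(requests.get(ajax_url).text)
    return {job_id: BeautifulSoup(desc, 'html.parser').get_text() for job_id, desc in data.items()}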

web-scraping with python 3.x and PyCharm

I want to get incubator information by web scraping, and I am using Python, but I get nothing after running my code. Here is my code. I need your help!
import requests
from requests.exceptions import RequestException
import re

def get_one_page(url):
    try:
        r = requests.get(url)
        if r.status_code == 200:
            return r.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile('f14px c-blue.*?><a.*?>(.*?)</a>.*?fn14px c-666>(.*?)</td>')
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'name': item[0],
            'address': item[1]
        }

def main(offset):
    url = 'http://www.cnfuhuaqi.com/couveuse/0-0-0-0-0-%d.aspx' % offset
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)

if __name__ == '__main__':
    for i in range(2, 72):
        main(i)
Never parse HTML with regex; use an HTML parser such as BeautifulSoup. In your case, you only need to select the element with the zjfw-list-con class and extract the tables inside it. The following will extract the image src URL, the link and the description for two iterations (pages 2 and 3):
from bs4 import BeautifulSoup
import requests

incubators = []

def extract_data(url):
    print("get data from {}".format(url))
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("div", {"class": "zjfw-list-con"})[0].find_all("table")
    for table in tables:
        for subtable in table.find_all('table'):
            items = subtable.find('tr').find_all('td')
            item_tuple = (
                items[0].find('img')['src'],
                items[1].find('a')['href'],
                items[2].text.strip()
            )
            print(item_tuple)
            incubators.append(item_tuple)

url = 'http://www.cnfuhuaqi.com/couveuse/0-0-0-0-0-%d.aspx'
for i in range(2, 4):
    extract_data(url % i)

print("the full list : ")
for i in incubators:
    print(' '.join(i))

How to obtain all the links in a domain using Python?

I want to use Python to obtain all the links in a domain, given the 'root' URL (in a list). Suppose I am given the URL http://www.example.com: it should return all the links on this page that belong to the same domain as the root URL, then recurse into each of those links, visiting them and extracting all their same-domain links, and so on. By "same domain" I mean that, given http://www.example.com, the only links I want back are http://www.example.com/something, http://www.example.com/somethingelse, ... Anything external, such as http://www.otherwebsite.com, should be discarded. How can I do this using Python?
EDIT: I made an attempt using lxml. I don't think this works fully, and I am not sure how to take into account links to already processed pages (causing infinite loop).
import urllib
import lxml.html

#given a url returns list of all sublinks within the same domain
def getLinks(url):
    urlList = []
    urlList.append(url)
    sublinks = getSubLinks(url)
    for link in sublinks:
        absolute = url + '/' + link
        urlList.extend(getLinks(absolute))
    return urlList

#determine whether two links are within the same domain
def sameDomain(url, dom):
    return url.startswith(dom)

#get tree of sublinks in same domain, url is root
def getSubLinks(url):
    sublinks = []
    connection = urllib.urlopen(url)
    dom = lxml.html.fromstring(connection.read())
    for link in dom.xpath('//a/@href'):
        if not (link.startswith('#') or link.startswith('http') or link.startswith('mailto:')):
            sublinks.append(link)
    return sublinks
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime

def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content)
    return soup

def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code =
    return error_code

def find_internal_urls(lufthansa_url, depth=0, max_depth=2):
    all_urls_info = []
    status_dict = {}
    soup = get_soup(lufthansa_url)
    a_tags = soup.findAll("a", href=True)
    if depth > max_depth:
        return {}
    else:
        for a_tag in a_tags:
            if "http" not in a_tag["href"] and "/" in a_tag["href"]:
                url = "http://www.lufthansa.com" + a_tag['href']
            elif "http" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info

if __name__ == "__main__":
    depth = 2  # suppose
    all_page_urls = find_internal_urls("someurl", 2, 2)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])
The above snippet contains the necessary modules for scraping URLs from the Lufthansa airlines website. The only addition here is that you can specify the depth to which you want to scrape recursively.
Here is what I've done, only following full URLs like http://domain[xxx]. Quick but a bit dirty.
import requests
import re

domain = u"stackoverflow.com"
http_re = re.compile(u"(http:\/\/" + domain + "[\/\w \.-]*\/?)")

visited = set([])

def visit(url):
    visited.add(url)
    extracted_body = requests.get(url).text
    matches = re.findall(http_re, extracted_body)
    for match in matches:
        if match not in visited:
            visit(match)

visit(u"http://" + domain)
print(visited)
There are some bugs in the code of @namita. I modified it, and it works well now.
import sys
import requests
import hashlib
from bs4 import BeautifulSoup
from datetime import datetime

def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "lxml")
    return soup

def get_status_code(link):
    """
    Return the error code for any url
    param: link
    """
    try:
        error_code = requests.get(link).status_code
    except requests.exceptions.ConnectionError:
        error_code = -1
    return error_code

def find_internal_urls(main_url, depth=0, max_depth=2):
    all_urls_info = []
    soup = get_soup(main_url)
    a_tags = soup.findAll("a", href=True)

    if main_url.endswith("/"):
        domain = main_url
    else:
        domain = "/".join(main_url.split("/")[:-1])
    print(domain)
    if depth > max_depth:
        return {}
    else:
        for a_tag in a_tags:
            if "http://" not in a_tag["href"] and "https://" not in a_tag["href"] and "/" in a_tag["href"]:
                url = domain + a_tag['href']
            elif "http://" in a_tag["href"] or "https://" in a_tag["href"]:
                url = a_tag["href"]
            else:
                continue
            # print(url)
            status_dict = {}
            status_dict["url"] = url
            status_dict["status_code"] = get_status_code(url)
            status_dict["timestamp"] = datetime.now()
            status_dict["depth"] = depth + 1
            all_urls_info.append(status_dict)
    return all_urls_info

if __name__ == "__main__":
    url = # your domain here
    depth = 1
    all_page_urls = find_internal_urls(url, 0, 2)
    # print("\n\n", all_page_urls)
    if depth > 1:
        for status_dict in all_page_urls:
            find_internal_urls(status_dict['url'])
The code below worked for me, though I don't know if it's 100% correct. It extracts all the internal URLs in the website.
import requests
from bs4 import BeautifulSoup

def get_soup(link):
    """
    Return the BeautifulSoup object for input link
    """
    request_object = requests.get(link, auth=('user', 'pass'))
    soup = BeautifulSoup(request_object.content, "lxml")
    return soup

visited = set([])

def visit(url, domain):
    visited.add(url)
    soup = get_soup(url)
    a_tags = soup.findAll("a", href=True)
    for a_tag in a_tags:
        if "http://" not in a_tag["href"] and "https://" not in a_tag["href"] and "/" in a_tag["href"]:
            url = domain + a_tag['href']
        elif "http://" in a_tag["href"] or "https://" in a_tag["href"]:
            url = a_tag["href"]
        else:
            continue
        if url not in visited and domain in url:
            # print(url)
            visit(url, domain)

url = input("Url: ")
domain = input("domain: ")
visit(u"" + url, domain)
print(visited)
From the tags of your question, I assume you are using Beautiful Soup.
First, you obviously need to download the webpage, for example with urllib.request. Once you have done that and have the contents in a string, you pass it to Beautiful Soup. After that, you can find all links with soup.find_all('a'), assuming soup is your Beautiful Soup object. Then you simply need to check the hrefs:
The simplest version would be to just check whether "http://www.example.com" is in the href, but that won't catch relative links. Some regular expression would probably do (find everything containing "www.example.com", or starting with "/", or starting with "?" (PHP)), or you might look for everything that contains a www but is not www.example.com and discard it, etc. The right strategy may depend on the website you are scraping and its coding style; a sketch of this filtering is shown below.
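For illustration, here is a minimal sketch of that idea (http://www.example.com and the helper name collect_same_domain_links are just placeholders); it uses urllib.parse to resolve relative links and to compare hosts instead of a hand-rolled regex:

from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def collect_same_domain_links(page_url):
    # download the page and parse it with Beautiful Soup
    soup = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    base_host = urlparse(page_url).netloc
    links = set()
    for a_tag in soup.find_all('a', href=True):
        # urljoin resolves relative hrefs ("/something", "?page=2") against the page URL
        absolute = urljoin(page_url, a_tag['href'])
        if urlparse(absolute).netloc == base_host:
            links.add(absolute)
    return links

print(collect_same_domain_links("http://www.example.com"))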
You can use a regular expression to filter out such links, e.g.:
<a\shref\=\"(http\:\/\/example\.com[^\"]*)\"
Take the above regex as a reference and start writing a script based on it; a short sketch of how it could be applied follows.
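A minimal sketch applying that pattern with re.findall (example.com is just a placeholder domain):

import re

import requests

# capture absolute links that start with the site's own domain
pattern = re.compile(r'<a\shref\=\"(http\:\/\/example\.com[^\"]*)\"')

html = requests.get("http://example.com").text
links = pattern.findall(html)
print(links)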
