How to extract anchor class using beautifulsoup? - python

Can anyone help me resolve this error? It's giving me a headache; I've already wasted an hour on it. I'm getting quotes_links as empty (it should contain all the anchors with that class), but it doesn't.
from flask import Flask, jsonify, request
# from markupsafe import escape
from bs4 import BeautifulSoup
import requests

app = Flask(__name__)

@app.route('/api/', methods=['GET'])
def API():
    if request.method == 'GET':
        url = 'https://www.brainyquote.com/'
        query = str(request.args['query'])
        if " " in query:
            query = str(query).replace(" ", "+")
        else:
            pass
        search = '/search_results?q=' + query
        ready_url = url + search
        content = requests.get(ready_url).content
        soup = BeautifulSoup(content, 'html.parser')
        quotes_links = soup.find_all("a", class_="b-qt")
        print("hello")
        print(quotes_links)
        list = []
        for i in quotes_links:
            d = {}
            quote_url = url + i.get('href')
            quote_content = requests.get(quote_url).content
            quote_soup = BeautifulSoup(quote_content, 'html.parser')
            d['quote'] = quote_soup.find('p', class_="b-qt").text
            d['author'] = str(quote_soup.find('p', class_="bq-aut").text).strip()
            list.append(d)
        return jsonify(list)

if __name__ == "__main__":
    app.run(debug=True)
Please help me: why am I not getting any values in the JSON? My list is empty, and quotes_links is empty as well. Is there a syntax mistake or something else?

Your ready_url variable ends up having a double slash in it (i.e. https://www.brainyquote.com//search_results?q=testing). If you test that in a browser or with curl, you'll see that yields no results. If you fix the definition of url so it doesn't have the trailing slash (i.e. url='https://www.brainyquote.com'), your code will work.
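For reference, a minimal sketch of that fix; using urllib.parse.urljoin (not in the original code) also sidesteps the double slash regardless of whether url ends with a slash. The brainyquote URL and the b-qt class come from the question and may have changed since:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

url = 'https://www.brainyquote.com/'                    # trailing slash no longer matters
ready_url = urljoin(url, '/search_results?q=testing')   # urljoin collapses the duplicate slash
print(ready_url)  # https://www.brainyquote.com/search_results?q=testing

content = requests.get(ready_url).content
soup = BeautifulSoup(content, 'html.parser')
quotes_links = soup.find_all("a", class_="b-qt")
print(len(quotes_links))  # non-zero only if the site's markup still uses the b-qt class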

Related

Python Get Request All Pages Movie list

When I use the snippet below, it is not returning the values of page, total_pages, or data.
It is also not returning the value of the function getMovieTitles.
import request
import json

def getMovieTitles(substr):
    titles = []
    url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}'.format(substr)"
    data = requests.get(url)
    print(data)
    response = json.loads(data.content.decode('utf-8'))
    print(data.content)
    for page in range(0, response['total_pages']):
        page_response = requests.get("https://jsonmock.hackerrank.com/api/movies/search/?Title={}}&page={}".format(substr, page + 1))
        page_content = json.loads(page_response.content.decode('utf-8'))
        print('page_content', page_content, 'type(page_content)', type(page_content))
        for item in range(0, len(page_content['data'])):
            titles.append(str(page_content['data'][item]['Title']))
    titles.sort()
    return titles

print(getMovieTitles('Superman'))
You're not formatting the url string correctly.
url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}'.format(substr)"
format() is a string method, but you've put the call inside the url string itself. Instead, do:
url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}".format(substr)
First, fix the import:
import requests
Then, the problem is in your string formatting: there is a stray ' where the closing " should be, so .format(substr) ends up inside the string. It should be:
url = "https://jsonmock.hackerrank.com/api/movies/search/?Title={}".format(substr)
and there is one } too many:
page_response = requests.get("https://jsonmock.hackerrank.com/api/movies/search/?Title={}&page={}".format(substr, page + 1))

404: "GET /search?q=books HTTP/1.1" 404 -

I am new to web scraping and to building APIs, and I'm facing an error while scraping an e-commerce website. Below is my Python code; please guide me. I am getting "The requested URL was not found on the server." while running it on localhost.
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import requests

app = Flask(__name__)

@app.route('/', methods=['GET'])
def API():
    if request.method == 'GET':
        uri = 'https://www.flipkart.com'
        query = str(request.args['query'])
        print(query)
        if " " in query:
            query = str(query).replace(" ", "+")
        else:
            pass
        search = '/search?q=' + query
        ready_uri = uri + search
        print(ready_uri)
        content = requests.get(ready_uri).content
        soup = BeautifulSoup(content, 'html.parser')
        quotes_links = soup.find_all('a', {'class': '_3O0U0u'})
        l = []
        for i in quotes_links:
            d = {}
            quote_url = uri + i.get('href')
            quote_content = requests.get(quote_url).content
            quote_soup = BeautifulSoup(quote_content, 'html.parser')
            d['quote'] = quote_soup.find('p', {'class': '_3wU53n'}).text
            d['author'] = quote_soup.find('p', {'class': '_1vC4OE _2rQ-NK'}).text
            l.append(d)
        return jsonify(l)

if __name__ == '__main__':
    app.run()
Error:
"GET /search?q=books HTTP/1.1" 404 -
How do you get a query string in Flask?
You appear to be getting the query argument incorrectly.
query = str(request.args['query'])
When it should be:
query = str(request.args.get('query'))
Doing so returns a 200, but with blank data. I would suggest looking at the element you're scraping:
quotes_links = soup.find_all('a', {'class': '_3O0U0u'})
Once you obtain the correct element with soup, you should start seeing return data.
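As a rough illustration of the difference, reusing the question's route and parameter name (request.args.get also lets you pass a default, so a missing parameter does not raise):
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/', methods=['GET'])
def api():
    # request.args['query'] raises a 400 BadRequest when 'query' is absent;
    # request.args.get('query') returns None (or the default you pass) instead.
    query = request.args.get('query', default='', type=str)
    if not query:
        return jsonify({'error': 'missing ?query= parameter'}), 400
    return jsonify({'query': query})

if __name__ == '__main__':
    app.run(debug=True)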

Problems with data retrieving using Python web scraping

I wrote a simple script for scraping data from a web page. I specified everything, like the object's tag and class, but my program does not scrape the data. One more thing: there is an email address that I also want to scrape, but I don't know how to identify its id or class. Could you please guide me on how to fix this issue? Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_detail_data(soup):
    try:
        title = soup.find('hi', class_="page-header", id=False).text
    except:
        title = 'empty'
    print(title)
    try:
        email = soup.find('', class_="", id=False).text
    except:
        email = 'empty'
    print(email)

def main():
    url = "https://www.igrc.org/clergydetail/2747164"
    # get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
As you noticed, the email value is not in plain text. The HTML is generated via JavaScript in a script tag:
<script type="text/javascript">document.write(String.fromCharCode(60,97,32,104,114,101,102,61,34,35,34,32,115,116,121,108,101,61,34,117,110,105,99,111,100,101,45,98,105,100,105,58,98,105,100,105,45,111,118,101,114,114,105,100,101,59,100,105,114,101,99,116,105,111,110,58,114,116,108,59,34,32,111,110,99,108,105,99,107,61,34,116,104,105,115,46,104,114,101,102,61,83,116,114,105,110,103,46,102,114,111,109,67,104,97,114,67,111,100,101,40,49,48,57,44,57,55,44,49,48,53,44,49,48,56,44,49,49,54,44,49,49,49,44,53,56,44,49,49,52,44,49,49,49,44,57,56,44,54,52,44,49,48,57,44,49,48,49,44,49,49,54,44,49,48,52,44,49,49,49,44,49,48,48,44,49,48,53,44,49,49,53,44,49,49,54,44,52,54,44,57,57,44,57,57,41,59,34,62,38,35,57,57,59,38,35,57,57,59,38,35,52,54,59,38,35,49,49,54,59,38,35,49,49,53,59,38,35,49,48,53,59,38,35,49,48,48,59,38,35,49,49,49,59,38,35,49,48,52,59,38,35,49,49,54,59,38,35,49,48,49,59,38,35,49,48,57,59,38,35,54,52,59,38,35,57,56,59,38,35,49,49,49,59,38,35,49,49,52,59,60,47,97,62));</script>
which contains all the character codes (ASCII). Decoding it gives:
cc.tsidohtem#bor
which needs to be decoded again. We just need the mailto, which is present in onclick (the content of the mailto is unchanged, whereas the text of the a tag is reversed using direction: rtl, as noticed by Hugo):
mailto:john#doe.inc
The following Python code extracts the mail:
import requests
from bs4 import BeautifulSoup
import re

r = requests.get("https://www.igrc.org/clergydetail/2747164")
soup = BeautifulSoup(r.text, 'html.parser')

titleContainer = soup.find(class_="page-header")
title = titleContainer.text.strip() if titleContainer else "empty"

emailScript = titleContainer.findNext("script").text

def parse(data):
    res = re.search(r'\(([\d+,]*)\)', data, re.IGNORECASE)
    return "".join([
        chr(int(i))
        for i in res.group(1).split(",")
    ])

emailData1 = parse(emailScript)
email = parse(emailData1)

print(title)
print(email.split(":")[1])
One could reproduce this encoding the other way around using the following code:
def encode(data):
    return ",".join([str(ord(i)) for i in data])

mail = "john#doe.inc"
encodedMailTo = encode("mailto:" + mail)
encodedHtmlEmail = "".join(["&#" + str(ord(i)) + ";" for i in mail])
htmlContainer = f'<a href="#" style="unicode-bidi:bidi-override;direction:rtl;" onclick="this.href=String.fromCharCode({encodedMailTo});">{encodedHtmlEmail}</a>'
encodedHtmlContainer = encode(htmlContainer)
scriptContainer = f'<script type="text/javascript">document.write(String.fromCharCode({encodedHtmlContainer}));</script>'
print(scriptContainer)
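To sanity-check the round trip, you can feed the generated scriptContainer back through the parse() helper from the scraping snippet above (this assumes both snippets live in the same file, so parse is in scope):
first_pass = parse(scriptContainer)   # recovers the generated <a ...> tag
second_pass = parse(first_pass)       # recovers the mailto target from onclick
print(second_pass)                    # mailto:john#doe.inc
print(second_pass.split(":")[1])      # john#doe.inc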

Having problems with a simple Instagram Scraper

I am pretty new to programming and I am learning Python for my social engineering project, so I'm really sorry if this makes you facepalm.
I was following a tutorial on scraping certain information from an Instagram page. Let's say, for example, I wanted to extract info from www.instagram.com/nbamemes.
I am getting an error at line 12, "IndentationError: expected an indented block". I have googled that, but I just don't get the code. Where are the placeholders where I need to fill in my own information?
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
class insta_Scraper_v1:
def getinfo(self, url):
html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.find_all('meta', attr={'property': 'og:description'})
text = data[0]
user = '%s %s %s' % (text[-3], text[-2], text[-1])
followers = text[0]
following = text[2]
posts = text[4]
print('User:', user)
print('Followers:', followers)
print('Following:', following)
print('Posts:', posts)
print('-----------------------')
def mail(self):
self.ctx = ssl.create_default_context()
self.ctx.check_hostname = False
self.ctx.verify_mode = ssl.CERT_NONE
with open('123.txt') as f:
self.content = f.readlines()
self.content = [x.strip() for x in self.content]
for url in self.content:
self.getinfo(url)
if __name__ == '__main__'
obj = insta_Scraper_v1()
obj.mail()
I used a tutorial to write this, but I don't quite get the whole thing. It's not completely beginner friendly and I seem to need help. Again, sorry for this super-beginner question.
Best regards,
lev
In the future, it would be useful to share the error message produced by your code; it includes the line at which the error occurred.
Based on the code you provided, I can see that you did not indent the code inside your functions. After a function declaration (def), you need to indent all the code inside it.
So from:
def getinfo (self, url):
html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.find_all ('meta', attr={'property': 'og:description'})
To:
def getinfo (self, url):
    html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    data = soup.find_all ('meta', attr={'property': 'og:description'})
Indentation is the block separator in Python; below is the indented code. Whenever you use conditionals, loops, def, or class, you create a block, and to define that block you have to indent the code using spaces. A tab or four spaces per level is typical, but even a single space works, as long as it is consistent.
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json

class insta_Scraper_v1:
    def getinfo(self, url):
        html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.find_all('meta', attr={'property': 'og:description'})
        text = data[0]
        user = '%s %s %s' % (text[-3], text[-2], text[-1])
        followers = text[0]
        following = text[2]
        posts = text[4]
        print('User:', user)
        print('Followers:', followers)
        print('Following:', following)
        print('Posts:', posts)
        print('-----------------------')

    def mail(self):
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        with open('123.txt') as f:
            self.content = f.readlines()
        self.content = [x.strip() for x in self.content]
        for url in self.content:
            self.getinfo(url)

if __name__ == '__main__':
    obj = insta_Scraper_v1()
    obj.mail()
Ref: Geeks For Geeks: Indentation
Thanks

Getting all links of a website

Hi, I wanted to create a mini crawler without using Scrapy.
I created something like this:
response = requests.get(url)
homepage_link_list = []
soup = BeautifulSoup(response.content, 'lxml')
for link in soup.findAll("a"):
    if link.get("href"):
        homepage_link_list.append(link.get("href"))

link_list = []
for item in homepage_link_list:
    response = requests.get(item)
    soup = BeautifulSoup(response.content, 'lxml')
    for link in soup.findAll("a"):
        if link.get("href"):
            link_list.append(link.get("href"))
The problem I am encountering is that it only gets the links one level deep (the links found on the pages linked from the homepage). How can I make it get all the links across the whole website?
You need a recursive call flow. I have written class-oriented code below. The main points are as follows:
This implementation is depth-first
Keep track of already scraped URLs so that we don't scrape them again
Ignore targets on a page. Eg. if http://example.com#item1, ignore item1
If https://example.com is already crawled, ignore http://example.com
Discard trailing slash. Eg. if http://example.com is already crawled, ignore http://example.com/
''' Scraper.
'''
import re
from urllib.parse import urljoin, urlsplit, SplitResult
import requests
from bs4 import BeautifulSoup

class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None
        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])   # remove trailing /
        fields['fragment'] = ''                              # remove targets within a page
        fields = SplitResult(**fields)
        if fields.netloc == self.domain:
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)
            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl
        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
        If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl
        print("Scraping {:s} ...".format(url))
        self.urls.add(url)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.findAll("a"):
            childurl = self.preprocess_url(url, link.get("href"))
            if childurl:
                self.scrape(childurl)

if __name__ == '__main__':
    rscraper = RecursiveScraper("http://bbc.com")
    rscraper.scrape()
    print(rscraper.urls)
It could be that the links you want to scrape are not actually links; they could be images. Sorry for writing this as an answer; I don't have enough reputation to comment.
Your code is not fetching all the links of the website because it is not recursive. You fetch the homepage links and traverse the links found in the content of those homepage pages, but you never traverse the links you find in the content of the pages you just visited. My advice is to look at tree traversal algorithms and develop a (recursive) traversal scheme accordingly: the nodes of the tree represent the links, with the root node being the link you start from.
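For illustration only, here is a minimal breadth-first sketch of that traversal idea: a queue plus a visited set, restricted to the start URL's domain (bbc.com is just the example domain from the other answer, and max_pages is an arbitrary safety limit):
from collections import deque
from urllib.parse import urljoin, urlsplit
import requests
from bs4 import BeautifulSoup

def crawl(start_url, max_pages=100):
    # Breadth-first crawl: visit pages level by level, never revisiting a URL.
    domain = urlsplit(start_url).netloc
    seen = {start_url}
    queue = deque([start_url])
    while queue and len(seen) <= max_pages:
        url = queue.popleft()
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            absolute = urljoin(url, href).split('#')[0]   # resolve relative links, drop fragments
            if urlsplit(absolute).netloc == domain and absolute not in seen:
                seen.add(absolute)
                queue.append(absolute)
    return seen

print(crawl("http://bbc.com"))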
Based about 95% on the answer by #coder.in.me, let me add another piece of code here that resolves an issue I was facing.
My issue was: if you try to scrape a URL like https://www.americanexpress.com/hu-hu/, the scraper only keeps the https://www.americanexpress.com/ part of it and crawls the Amex site globally, but I don't need the non-Hungarian pages.
You just need to change the line
if fields.netloc == self.domain:
to
if fields.netloc == self.domain and (fields.path.startswith('/hu-hu') or fields.path.startswith('/en-hu')):
Here is the modified code:
import re
from urllib.parse import urljoin, urlsplit, SplitResult
import requests
from bs4 import BeautifulSoup

class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None
        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])   # remove trailing /
        fields['fragment'] = ''                              # remove targets within a page
        fields = SplitResult(**fields)
        #if fields.netloc == self.domain:
        if fields.netloc == self.domain and (fields.path.startswith('/hu-hu') or fields.path.startswith('/en-hu')):
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)
            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl
        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
        If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl
        print("Scraping {:s} ...".format(url))
        try:
            response = requests.get(url)
            self.urls.add(url)
            soup = BeautifulSoup(response.content, 'lxml')
            for link in soup.findAll("a"):
                childurl = self.preprocess_url(url, link.get("href"))
                if childurl:
                    self.scrape(childurl)
        except requests.exceptions.SSLError:
            pass
        except requests.exceptions.InvalidSchema:
            pass

if __name__ == '__main__':
    rscraper = RecursiveScraper('https://www.americanexpress.com/hu-hu/')
    rscraper.scrape()
Thanks!
