Find specific Tag Python BeautifulSoup

Hey, I'm trying to extract URLs between two tags.
This is what I have so far:
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
links = []
for links in soup.findAll('cite'):
    print(links.get('cite'))
I have tried different things but I couldn't extract the URL between
<cite>.....</cite>
My updated code:
import requests
from bs4 import BeautifulSoup as bs
dorks = input("Keyword : ")
binglist = "http://www.bing.com/search?q="
with open(dorks, mode="r", encoding="utf-8") as my_file:
    for line in my_file:
        clean = binglist + line
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
        r = requests.get(clean, headers=headers)
        soup = bs(r.text, 'html.parser')
        links = soup.find('cite')
        print(links)
In the keyword file you just need to put any keyword, like:
test
games
Thanks for your help

You can do it as follows:
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
links = soup.find_all('cite')
for link in links:
    print(link.text)
You can webscrape Bing as follows:
import requests
from bs4 import BeautifulSoup as bs
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'}
r = requests.get("https://www.bing.com/search?q=test", headers=headers)
soup = bs(r.text, 'html.parser')
links = soup.find_all('cite')
for link in links:
    print(link.text)
This code does the following:
With requests we get the web page we're looking for. We set headers to avoid being blocked by Bing (for more information, see: https://oxylabs.io/blog/5-key-http-headers-for-web-scraping).
Then we parse the HTML and extract all cite tags with find_all (this returns a list).
For each element in the list, we only want what's inside the cite tag; using .text we print the contents of that tag.
Please pay attention to the headers!
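If you also want to loop over a keyword file the way your updated code does, a minimal sketch could look like the following (keywords.txt is a hypothetical file with one keyword per line; Bing's markup changes over time, so what the cite tags contain may vary):
import requests
from bs4 import BeautifulSoup as bs
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}
# keywords.txt is a hypothetical file with one search keyword per line
with open("keywords.txt", mode="r", encoding="utf-8") as my_file:
    for line in my_file:
        keyword = line.strip()  # drop the trailing newline before building the query
        if not keyword:
            continue
        r = requests.get("https://www.bing.com/search", params={"q": keyword}, headers=headers)
        soup = bs(r.text, "html.parser")
        for cite in soup.find_all("cite"):  # each <cite> holds a displayed result URL
            print(cite.text)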

Try this:
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
links = soup.find_all('cite')
for link in links:
    print(link.text)

You're looking for this to get links from Bing organic results:
# container with needed data: title, link, snippet, etc.
for result in soup.select(".b_algo"):
    link = result.select_one("h2 a")["href"]
Specifically, for the example you provided:
from bs4 import BeautifulSoup
html_doc = '<div class="b_attribution" u="1|5075|4778623818559697|b0YAhIRjW_h9ERBLSt80gnn9pWk7S76H"><cite>https://www.developpez.net/forums/d1497343/environnements-developpem...</cite><span class="c_tlbxTrg">'
soup = BeautifulSoup(html_doc, "html.parser")
link = soup.select_one('.b_attribution cite').text
print(link)
# https://www.developpez.net/forums/d1497343/environnements-developpem...
Code and example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
}
params = {
    "q": "lasagna",
    "hl": "en",
}
html = requests.get("https://www.bing.com/search", headers=headers, params=params)
soup = BeautifulSoup(html.text, "lxml")
for links in soup.select(".b_algo"):
    link = links.select_one("h2 a")["href"]
    print(link)
------------
'''
https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
https://www.foodnetwork.com/topics/lasagna
https://www.tasteofhome.com/recipes/best-lasagna/
https://www.simplyrecipes.com/recipes/lasagna/
'''
Alternatively, you can achieve the same thing by using Bing Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to deal with extraction, maintenance, or bypassing blocks; instead, you only need to iterate over structured JSON and get what you want.
Code to integrate to achieve your goal:
from serpapi import GoogleSearch
import os
params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "bing",
    "q": "lasagna"
}
search = GoogleSearch(params)
results = search.get_dict()
for result in results['organic_results']:
    link = result['link']
    print(link)
------------
'''
https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
https://www.foodnetwork.com/topics/lasagna
https://www.tasteofhome.com/recipes/best-lasagna/
https://www.simplyrecipes.com/recipes/lasagna/
'''
Disclaimer, I work for SerpApi.

Related

scraping comments from booking.com

I'm trying to get all the reviews from a specific hotel page on booking.com.
I have tried this code, but I'm not getting anything printed at all.
This is the code I tried:
import urllib.request
from bs4 import BeautifulSoup
url='https://www.booking.com/hotel/sa/sarwat-park.ar.html?aid=304142&label=gen173nr-1DCAEoggI46AdIM1gEaMQBiAEBmAERuAEHyAEM2AED6AEBiAIBqAIDuAL_oY-aBsACAdICJDE5YzYxY2ZiLWRlYjUtNDRjNC04Njk0LTlhYWY4MDkzYzNhNNgCBOACAQ&sid=c7009aac67195c0a7ef9aa63f6537581&dest_id=6376991;dest_type=hotel;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;req_adults=2;req_children=0;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1665388865;srpvid=1219386046550156;type=total;ucfs=1&#tab-reviews'
req = urllib.request.Request(
    url,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    }
)
f = urllib.request.urlopen(req)
soup = BeautifulSoup(f.read().decode('utf-8'), 'html.parser')
reviews = soup.findAll("li", {"class": "review_item clearfix "})
for review in reviews:
    print(review.find("div", {"class": "review_item_header_content"}).text)
To begin with, there is no class "review_item" on the entire page.
A better approach would be to use lxml's etree to find and get details from the XPath of the reviews list that you have now:
//*[@id="b2hotelPage"]/div[25]/div/div/div/div[1]/div[2]/div/ul
Then you could do something like
import requests as req
from bs4 import BeautifulSoup as bs
from lxml import etree
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
webpage = req.get(url, headers=headers)
soup = bs(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
listTarget = dom.xpath('//*[@id="b2hotelPage"]/div[25]/div/div/div/div[1]/div[2]/div/ul')
This should give you a list of lxml objects which are essentially your comment cards.
Then you can work on them in a similar fashion
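For example, a rough sketch of pulling the text out of each comment card might look like this (the XPath above is position-based and brittle, so treat it as an assumption that may need updating):
if listTarget:
    for card in listTarget[0]:  # each child of the <ul> is one review card
        text = " ".join(card.itertext())  # collect all text nodes inside the card
        print(" ".join(text.split()))  # collapse whitespace for readability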

scraping google search results page data python

I want to scrape emails from the search results of a query, but when I access the class with the CSS selector "select" and print it, it always shows an empty list. How can I access the .r class or "class=g"?
import requests
from bs4 import BeautifulSoup
url = "https://www.google.com/search?sxsrf=ACYBGNQA4leQETe0psVZPu7daLWbdsc9Ow%3A1579194494737&ei=fpggXpvRLMakwQKkqpSICg&q=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&oq=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&gs_l=psy-ab.12...0.0..7407...0.0..0.0.0.......0......gws-wiz.82okhpdJLYg&ved=0ahUKEwibiI_3zYjnAhVGUlAKHSQVBaEQ4dUDCAs"
responce = requests.get(url)
soup = BeautifulSoup(responce.text, "html.parser")
test = soup.select('.r')
print(test)
Your program is correct, but to get a correct answer from Google, you need to specify the User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://www.google.com/search?sxsrf=ACYBGNQA4leQETe0psVZPu7daLWbdsc9Ow%3A1579194494737&ei=fpggXpvRLMakwQKkqpSICg&q=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&oq=%22computer+science+%22%22usa%22+%22%40yahoo.com%22&gs_l=psy-ab.12...0.0..7407...0.0..0.0.0.......0......gws-wiz.82okhpdJLYg&ved=0ahUKEwibiI_3zYjnAhVGUlAKHSQVBaEQ4dUDCAs"
headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0'}
responce = requests.get(url, headers=headers) # <-- specify custom header
soup = BeautifulSoup(responce.text, "html.parser")
test = soup.select('.r')
print(test)
Prints:
[<div class="r"><a href="https://www.yahoo.com/news/11-course-complete-computer-science-171322233.html" onmousedown="return rwt(this,'','','','1','AOvVaw2wM4TUxc_4V7s9GjeWTNAG','','2ahUKEwjt17Kk-YjnAhW2R0EAHcnsC3QQFjAAegQIAxAB','','',event)"><div class="TbwUpd"><img alt="https://...
...
To get the emails out of the Google Search results you need to use regex
# this regex needs possible modifications
re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', variable_where_to_search_from)
Code:
from bs4 import BeautifulSoup
import requests, lxml, re
headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
html = requests.get('https://www.google.com/search?q="computer science ""usa" "@yahoo.com"', headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
for result in soup.select('.tF2Cxc'):
    try:
        snippet = result.select_one('.lyLwlc').text
    except:
        snippet = None
    match_email = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', str(snippet))
    email = '\n'.join(match_email).strip()
    print(email)
----------
'''
ahmed_733@yahoo.com
yjzou@uguam.uog
yzou2002@yahoo.com
...
Alternatively, you can do the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
It doesn't extract emails using regex, although that would be a great possible feature. The main difference is that it's much easier and faster to get things done, rather than creating everything from scratch.
Code to integrate:
from serpapi import GoogleSearch
import re
params = {
    "api_key": "YOUR_API_KEY",
    "engine": "google",
    "q": '"computer science ""usa" "@yahoo.com"',
}
search = GoogleSearch(params)
results = search.get_dict()
for result in results['organic_results']:
    try:
        snippet = result['snippet']
    except:
        snippet = None
    match_email = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', str(snippet))
    email = '\n'.join(match_email).strip()
    print(email)
---------
'''
shaikotweb@yahoo.com
ahmed_733@yahoo.com
RPeterson@L1id.com
rj_peterson@yahoo.com
'''
Disclaimer, I work for SerpApi.

Beautifulsoup is returning double links

I am trying to learn how to scrape websites, and therefore I am not using an API. I am trying to scrape eBay's website, and my script prints double URLs. I did my due diligence and searched on Google/Stack Overflow for help, but was unable to find any solution. Thanks in advance.
driver.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=watches&_sacat=0&_pgn=' + str(i))
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.maximize_window()
tempList = []
for link in soup.find_all('a', href=True):
    if 'itm' in link['href']:
        print(link['href'])
        tempList.append(link['href'])
Entire code: https://pastebin.com/q41eh3Q6
Just add the class name while searching for all the links. Hope this helps.
i = 1
driver.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=watches&_sacat=0&_pgn=' + str(i))
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.maximize_window()
tempList = []
for link in soup.find_all('a', class_='s-item__link', href=True):
    if 'itm' in link['href']:
        print(link['href'])
        tempList.append(link['href'])
print(len(tempList))
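If you'd rather keep your original selector, note that the duplicates come from each listing containing more than one <a> whose href includes 'itm' (for example the image link and the title link), so another option is to deduplicate the collected list afterwards; a minimal sketch:
# deduplicate while preserving the original order (dicts keep insertion order in Python 3.7+)
unique_links = list(dict.fromkeys(tempList))
print(len(unique_links))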
You're looking for this:
# container with needed data: title, link, price, condition, number of reviews, etc.
for item in soup.select('.s-item__wrapper.clearfix'):
    # only link will be extracted from the container
    link = item.select_one('.s-item__link')['href']
Code and full example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
html = requests.get('https://www.ebay.com/sch/i.html?_nkw=Watches', headers=headers).text
soup = BeautifulSoup(html, 'lxml')
temp_list = []
for item in soup.select('.s-item__wrapper.clearfix'):
    link = item.select_one('.s-item__link')['href']
    temp_list.append(link)
    print(link)
------------
'''
https://www.ebay.com/itm/203611966827?hash=item2f68380d6b:g:pBAAAOSw1~NhRy4Y
https://www.ebay.com/itm/133887696438?hash=item1f2c541e36:g:U3IAAOSwBKthN4yg
https://www.ebay.com/itm/154561925393?epid=26004285120&hash=item23fc9bd111:g:TWUAAOSwf3pgNP08
https://www.ebay.com/itm/115010872425?hash=item1ac72ea469:g:yQsAAOSweMBhT4gs
https://www.ebay.com/itm/115005461839?epid=1776383383&hash=item1ac6dc154f:g:QskAAOSwDe9hS7Ys
https://www.ebay.com/itm/224515689673?hash=item34462d8cc9:g:oTwAAOSwAO5gna8u
https://www.ebay.com/itm/124919898822?hash=item1d15ce62c6:g:iEoAAOSwhAthQnX9
https://www.ebay.com/itm/133886767671?hash=item1f2c45f237:g:htkAAOSwNAhhQOyf
https://www.ebay.com/itm/115005341920?hash=item1ac6da40e0:g:4SIAAOSwWi1hR5Mx
...
'''
Alternatively, you can achieve the same thing by using eBay Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to deal with the extraction process and maintain it over time; instead, you only need to iterate over structured JSON and get the data you want.
Code to integrate:
from serpapi import GoogleSearch
import os
params = {
    "engine": "ebay",
    "ebay_domain": "ebay.com",
    "_nkw": "watches",
    "api_key": os.getenv("API_KEY"),
}
search = GoogleSearch(params)
results = search.get_dict()
temp_list = []
for result in results['organic_results']:
    link = result['link']
    temp_list.append(link)
    print(link)
------------
'''
https://www.ebay.com/itm/203611966827?hash=item2f68380d6b:g:pBAAAOSw1~NhRy4Y
https://www.ebay.com/itm/133887696438?hash=item1f2c541e36:g:U3IAAOSwBKthN4yg
https://www.ebay.com/itm/154561925393?epid=26004285120&hash=item23fc9bd111:g:TWUAAOSwf3pgNP08
https://www.ebay.com/itm/115010872425?hash=item1ac72ea469:g:yQsAAOSweMBhT4gs
https://www.ebay.com/itm/115005461839?epid=1776383383&hash=item1ac6dc154f:g:QskAAOSwDe9hS7Ys
https://www.ebay.com/itm/224515689673?hash=item34462d8cc9:g:oTwAAOSwAO5gna8u
https://www.ebay.com/itm/124919898822?hash=item1d15ce62c6:g:iEoAAOSwhAthQnX9
https://www.ebay.com/itm/133886767671?hash=item1f2c45f237:g:htkAAOSwNAhhQOyf
https://www.ebay.com/itm/115005341920?hash=item1ac6da40e0:g:4SIAAOSwWi1hR5Mx
...
'''
P.S. - I wrote a more in-depth blog post about how to scrape eBay search with Python.
Disclaimer, I work for SerpApi.

Exact website links from google through BeautifulSoup

I want to search Google using BeautifulSoup and open the first link. But when I opened the link it showed an error. The reason, I think, is that Google is not providing the exact link to the website; it has added several parameters to the URL. How do I get the exact URL?
When I tried to use the cite tag it worked, but for big URLs it creates problems.
The first link which I get using soup.h3.a['href'][7:] is:
'http://www.wikipedia.com/wiki/White_holes&sa=U&ved=0ahUKEwi_oYLLm_rUAhWJNI8KHa5SClsQFggbMAI&usg=AFQjCNGN-vlBvbJ9OPrnq40d0_b8M0KFJQ'
Here is my code:
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.google.com/search?q=site:wikipedia.com+Black+hole&gbv=1&sei=YwHNVpHLOYiWmQHk3K24Cw')
soup = BeautifulSoup(r.text, "html.parser")
print(soup.h3.a['href'][7:])
You could split the returned string:
url = soup.h3.a['href'][7:].split('&')
print(url[0])
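A slightly more robust variant is to parse the href as a query string instead of slicing and splitting by hand, which also percent-decodes the target URL. A sketch, assuming the href still has the /url?q=... redirect form shown above:
from urllib.parse import urlparse, parse_qs
href = soup.h3.a['href']  # e.g. '/url?q=http://www.wikipedia.com/wiki/White_holes&sa=U&...'
query = parse_qs(urlparse(href).query)  # parse the q, sa, ved, usg parameters
print(query['q'][0])  # the decoded destination URL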
Hopefully, by combining all the answers presented above, your code will look like this:
from bs4 import BeautifulSoup
import requests
import csv
import os
import time
url = "https://www.google.co.in/search?q=site:wikipedia.com+Black+hole&dcr=0&gbv=2&sei=Nr3rWfLXMIuGvQT9xZOgCA"
r = requests.get(url)
data = r.text
url1 = "https://www.google.co.in"
soup = BeautifulSoup(data, "html.parser")
get_details = soup.find_all("div", attrs={"class":"g"})
final_data = []
for details in get_details:
    link = details.find_all("h3")
    #links = ""
    for mdetails in link:
        links = mdetails.find_all("a")
        lmk = ""
        for lnk in links:
            lmk = lnk.get("href")[7:].split("&")
            sublist = []
            sublist.append(lmk[0])
            final_data.append(sublist)
filename = "Google.csv"
with open("./" + filename, "w") as csvfile:
    csvfile = csv.writer(csvfile, delimiter=",")
    csvfile.writerow("")
    for i in range(0, len(final_data)):
        csvfile.writerow(final_data[i])
It's much simpler. You're looking for this:
# instead of this:
soup.h3.a['href'][7:].split('&')
# use this:
soup.select_one('.yuRUbf a')['href']
Code and example in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
    "q": "site:wikipedia.com black hole",  # query
    "gl": "us",  # country to search from
    "hl": "en"  # language
}
html = requests.get("https://www.google.com/search", headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
first_link = soup.select_one('.yuRUbf a')['href']
print(first_link)
# https://en.wikipedia.com/wiki/Primordial_black_hole
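Note that select_one() returns None when nothing matches (for example if Google serves a consent page or renames the class), so it can be safer to guard before indexing; a small sketch:
node = soup.select_one('.yuRUbf a')
first_link = node['href'] if node else None  # avoids a TypeError when the selector matches nothing
print(first_link)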
Alternatively, you can achieve the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you only need to extract the data from the structured JSON, rather than figuring out why things don't work and then maintaining the code over time if some selectors change.
Code to integrate:
import os
from serpapi import GoogleSearch
params = {
    "engine": "google",
    "q": "site:wikipedia.com black hole",
    "hl": "en",
    "gl": "us",
    "api_key": os.getenv("API_KEY"),
}
search = GoogleSearch(params)
results = search.get_dict()
# [0] - first index of search results
first_link = results['organic_results'][0]['link']
print(first_link)
# https://en.wikipedia.com/wiki/Primordial_black_hole
Disclaimer, I work for SerpApi.

how to extract amazon product links using python

I'm a beginner in Python; I just want to scrape product links from an Amazon page.
For example, I want to scrape this page:
http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368 and I use this code in Python:
from bs4 import BeautifulSoup
import requests
url = "http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
file = open("parseddata.txt", "wb")
links = soup.find_all('a', {'class': 'a-link-normal s-access-detail-page a-text-normal'})
for link in links:
    print(link.get('href'))
    file.write(href + '\n')
file.close()
I just want the product title links as the output. Can anyone tell me where I am going wrong?
Add a user-agent to the request headers to pretend that you are not a robot.
from bs4 import BeautifulSoup
import requests
url = "http://www.amazon.com/s/ref=sr_in_-2_p_4_18?me=A3MZ96G5C78IVQ&fst=as%3Aoff&rh=p_4%3AFunKo&ie=UTF8&qid=1477811368"
# add header
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
file = open(r"parseddata.txt", "w")
links = soup.find_all('a', {'class': 'a-link-normal s-access-detail-page a-text-normal'})
for link in links:
    print(link.get('href'))
    file.write(link.get('href') + '\n')
file.close()
Result
https://www.amazon.com/Funko-POP-Marvel-Dancing-Bobble/dp/B00N1EJXUU/ref=sr_1_1/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-1&refinements=p_4%3AFunKo
https://www.amazon.com/Funko-POP-Movies-Potter-Action/dp/B019JIA4IQ/ref=sr_1_2/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-2&refinements=p_4%3AFunKo
https://www.amazon.com/FunKo-2390-Funko-Darth-Maul/dp/B005F1QBMK/ref=sr_1_3/160-5408618-6684940?m=A3MZ96G5C78IVQ&s=merchant-items&ie=UTF8&qid=1477822032&sr=1-3&refinements=p_4%3AFunKo
........
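As a small tidy-up, the file handling can use a context manager so the file is closed even if something fails mid-loop; a sketch of just the output part, assuming links was collected with find_all as above:
with open("parseddata.txt", "w") as f:
    for link in links:
        href = link.get('href')
        if href:  # skip anchors without an href attribute
            print(href)
            f.write(href + '\n')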
