I am creating a project that scrapes Indeed's website. It was working fine, but when I ran it today, all of a sudden and without my having made any changes, instead of returning the entire page of results it now only displays the first result, duplicated. Can someone help me correct this?
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests
html_text = requests.get('https://www.ign.com/').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find('section',class_='right')
#print(html_text)
driver = webdriver.Chrome(executable_path='/Users/Miscellaneous/PycharmProjects/RecursivePractice/chromedriver')
url= "https://www.indeed.com/jobs?q=developer&l=Westbury%2C%20NY&vjk=0b0cbe29e5f86422"
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
officials = soup.findAll("a",{"class":"tapItem"})
for official in officials:
    jobTitle = soup.find('h2',{'class': 'jobTitle'}).text
    companyName = soup.find('div',{'class': 'comapny_location'})
    location = soup.find('div',{'class': 'companyLocation'}).text
    salary = soup.find('div',{'class': 'salary-snippet'})
    actualSalary = salary.find('span').text
    summary = soup.find('div',{'class': 'job-snippet'}).text
    print('Title: ' + str(jobTitle) + '\nCompany Name: ' + str(companyName) + '\nLocation: ' + str(location)
          + '\nSalary: ' + str(actualSalary) + "\nSummary: " + str(summary))
    #print(str(official))
    print(' ')
driver.quit()
Try this:
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests
html_text = requests.get('https://www.ign.com/').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find('section',class_='right')
driver = webdriver.Chrome(executable_path='/Users/Miscellaneous/PycharmProjects/RecursivePractice/chromedriver')
url= "https://www.indeed.com/jobs?q=developer&l=Westbury%2C%20NY&vjk=0b0cbe29e5f86422"
driver.maximize_window()
driver.get(url)
time.sleep(5)
content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content,"html.parser")
officials = soup.findAll("a",{"class":"tapItem"})
for i in range(len(officials)):
    jobTitle = soup.findAll('h2',{'class': 'jobTitle'})[i].text
    companyName = soup.findAll('div',{'class': 'comapny_location'})[i].text if len(soup.findAll('div',{'class': 'comapny_location'})) > i else "NULL"
    location = soup.findAll('div',{'class': 'companyLocation'})[i].text if len(soup.findAll('div',{'class': 'companyLocation'})) > i else "NULL"
    salary = soup.findAll('div',{'class': 'salary-snippet'})[i].text if len(soup.findAll('div',{'class': 'salary-snippet'})) > i else "NULL"
    summary = soup.findAll('div',{'class': 'job-snippet'})[i].text if len(soup.findAll('div',{'class': 'job-snippet'})) > i else "NULL"
    # salary is already plain text at this point, so no further find('span') is needed
    print('Title: ' + str(jobTitle) + '\nCompany Name: ' + str(companyName) + '\nLocation: ' + str(location)
          + '\nSalary: ' + str(salary) + "\nSummary: " + str(summary))
    print(' ')
driver.quit()
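For what it's worth, the root cause is that every soup.find(...) call inside the loop searches the whole page again, so it always returns the first match. Searching within each result card avoids the parallel findAll lists entirely. Here is a minimal sketch of that approach (it reuses the tapItem, jobTitle, etc. class names from above, which may break whenever Indeed changes its markup):

# Search within each result card rather than the whole page.
for official in officials:
    jobTitle = official.find('h2', {'class': 'jobTitle'})
    location = official.find('div', {'class': 'companyLocation'})
    salary = official.find('div', {'class': 'salary-snippet'})
    summary = official.find('div', {'class': 'job-snippet'})
    print('Title: ' + (jobTitle.text if jobTitle else 'NULL'))
    print('Location: ' + (location.text if location else 'NULL'))
    print('Salary: ' + (salary.text if salary else 'NULL'))
    print('Summary: ' + (summary.text if summary else 'NULL'))
    print(' ')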
Related
I'm trying to scrape data from a webpage, but it's returning ["F"] ["F"], which is what it should do if no data has been retrieved. Please see the code below:
import pandas as pd
import datetime
import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
def web_content_div(web_content, class_path):
    web_content_div = web_content.find_all('div', {"class": class_path})
    try:
        spans = web_content_div[0].find_all('span')
        texts = [span.get_text() for span in spans]
    except IndexError:
        texts = []
    return texts

def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code + '%27&.tsrc=fin-srch'
    # 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code + '&.tsrc=fin-srch'
    try:
        r = requests.get(url)
        web_content = BeautifulSoup(r.text, 'lxml')
        texts = web_content_div(web_content, 'My(6px) Pos(r) smarthphone_Mt(6px) W(100&%')
        if texts != []:
            price, change = texts[0], texts[1]
        else:
            price, change = ["F"], ["F"]
    except ConnectionError:
        price, change = [""], [""]
    return price, change

Stock = ["BRK-B"]
print(real_time_price("BRK-B"))
Your class_path doesn't exist due to a couple of typos. The website in question references "My(6px) Pos(r) smartphone_Mt(6px) W(100%)", which I believe is what you're targeting.
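For example, with the corrected class string the call would be (assuming Yahoo's markup still uses these generated class names, which change frequently):

# note "smartphone", not "smarthphone", and "W(100%)", not "W(100&%"
texts = web_content_div(web_content, 'My(6px) Pos(r) smartphone_Mt(6px) W(100%)')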
I am trying the snippet below to grab the p inside the div. When I run the script, the output includes all of its formatting tags.
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
driver = webdriver.Chrome('chromedriver.exe')
url = 'https://poocoin.app/rugcheck/0xf09b7b6ba6dab7cccc3ae477a174b164c39f4c66/dev-activity'
driver.get(url)
time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
pdata = soup.find_all('div',attrs={"class":"mt-2"})
for x in pdata:
    print(x.find('p'))
driver.quit()
Current Output:
<p>Go to chart</p>
<p>This is a log of activity related to the token from all wallets that have had ownership of the contract.</p>
<p>Wallet activity for 0x410e372657e088d5b7db76346cd958b1b642b984<br/><span class="text-muted text-small">(Ownership transferred to 0x0000000000000000000000000000000000000000 on 4/17/2021, 4:59:30 AM)</span></p>
Wanted Output:
0xf09b7b6ba6dab7cccc3ae477a174b164c39f4c66
Wallet activity for 0x410e372657e088d5b7db76346cd958b1b642b984
(Ownership transferred to 0x0000000000000000000000000000000000000000 on 17/04/2021, 4:59:30 am)
You can do this with regular expressions:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
driver = webdriver.Chrome('chromedriver.exe')
url = 'https://poocoin.app/rugcheck/0xf09b7b6ba6dab7cccc3ae477a174b164c39f4c66/dev-activity'
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
pdata = soup.find_all('div',attrs={"class":"mt-2"})
lines = [str(x.find('p')) for x in pdata]
address = re.search('/tokens/(0x\w+)"', lines[1]).group(1)
print(address)
activity = 'Wallet activity for ' + re.search('/address/(0x\w+)"', lines[3]).group(1)
print(activity)
matches = re.search('"_blank">(0x\w+)</a>( on [^\)]+)\)', lines[3])
ownership = '(Ownership transferred to ' + matches.group(1) + matches.group(2) + ')'
print(ownership)
driver.quit()
Output:
0xf09b7b6ba6dab7cccc3ae477a174b164c39f4c66
Wallet activity for 0x410e372657e088d5b7db76346cd958b1b642b984
(Ownership transferred to 0x0000000000000000000000000000000000000000 on 16/04/2021, 21:59:30)
Try:
pdata = soup.select('div.mt-2 p')
for x in pdata:
    print(x.text)
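If you also want the nested <span> on its own line, as in the wanted output, get_text() with a separator is one option (a sketch building on the selector above):

for x in soup.select('div.mt-2 p'):
    # the separator puts each nested tag's text on its own line
    print(x.get_text(separator='\n'))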
I'm almost happy with this script I've been putting together today. It has come together with some help (thanks to everyone who has assisted thus far) and some suspect programming on my part, but it is functional to a degree.
I want to dump the data to JSON. It seems to dump all the data correctly apart from the price (which was grabbed from <span></span>). I believe the issue lies with indenting, but I'm not 100% sure.
Could someone cast their eye over this snippet and correct what I cannot see? I think I'm going blind from being unable to spot the right change among the number of variations I've tried.
from bs4 import BeautifulSoup
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
os.environ["PYTHONIOENCODING"] = "utf-8"
#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html")
time.sleep(2)
#beautiful soup requests
#URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
#page = requests.get(URL)
#soup = BeautifulSoup(page.content, 'html.parser')
soup = BeautifulSoup(browser.page_source, features="lxml")
#products = soup.find_all("div", "GC62 Product")
products = soup.find_all("div", "GC62 Product")
for product in products:
    #barrel lengths
    barrels = product.find('select', attrs={'name': re.compile('length')})
    if barrels:
        barrels_list = [x['origvalue'][:2] for x in barrels.find_all('option')[1:]]
        for y in range(0, len(barrels_list)):
            #title
            title = product.find("h3")
            titleText = title.text if title else ''
            #manufacturer name
            manufacturer = product.find("div", "GC5 ProductManufacturer")
            manuText = manufacturer.text if manufacturer else ''
            #image location
            img = product.find("div", "ProductImage")
            imglinks = img.find("a") if img else ''
            imglinkhref = imglinks.get('href') if imglinks else ''
            imgurl = 'https://www.mcavoyguns.co.uk/contents'+imglinkhref
            #description
            description = product.find("div", "GC12 ProductDescription")
            descText = description.text if description else ''
            #descStr = str(descText)
            #more description
            more = product.find("div", "GC12 ProductDetailedDescription")
            moreText = more.text if more else ''
            #price
            spans = browser.find_elements_by_css_selector("div.GC20.ProductPrice span")
            for i in range(0,len(spans),2):
                span = spans[i].text
                i+=1
                #print(span)
                #print(barrels_list[y])
                #print(titleText)
                #print(manuText)
                #print(descText)
                #print(moreText)
                #print(imgurl.replace('..', ''))
                #print("\n")
            x = {
                "price": span,
                "barrel length": barrels_list[y],
                "title": titleText,
                "manufacturer": manuText,
                "description": descText,
                "desc cont": moreText,
                "image Location": imgurl.replace('..', '')
            }
            dump = json.dumps(x)
            print(dump)
            y+=1
I managed to make it work by modifying your code a little. Your last for loop is not really useful, as you have already found the product's tag. So you can do as follows:
from bs4 import BeautifulSoup
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
os.environ["PYTHONIOENCODING"] = "utf-8"
#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html")
time.sleep(2)
#beautiful soup requests
#URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
#page = requests.get(URL)
#soup = BeautifulSoup(page.content, 'html.parser')
soup = BeautifulSoup(browser.page_source, features="lxml")
#products = soup.find_all("div", "GC62 Product")
products = soup.find_all("div", "GC62 Product")
for product in products:
    #barrel lengths
    barrels = product.find('select', attrs={'name': re.compile('length')})
    if barrels:
        barrels_list = [x['origvalue'][:2] for x in barrels.find_all('option')[1:]]
        #title
        title = product.find("h3")
        titleText = title.text if title else ''
        #manufacturer name
        manufacturer = product.find("div", "GC5 ProductManufacturer")
        manuText = manufacturer.text if manufacturer else ''
        #image location
        img = product.find("div", "ProductImage")
        imglinks = img.find("a") if img else ''
        imglinkhref = imglinks.get('href') if imglinks else ''
        imgurl = 'https://www.mcavoyguns.co.uk/contents' + imglinkhref
        #description
        description = product.find("div", "GC12 ProductDescription")
        descText = description.text if description else ''
        #more description
        more = product.find("div", "GC12 ProductDetailedDescription")
        moreText = more.text if more else ''
        #price
        price = product.findChild(name="span")
        print("price : ", price)
        price_raw = price.text
        print("price_raw : ", price_raw)
        price_replaced = price_raw.replace(',', '').replace('£', '')
        print("price_replaced : ", price_replaced)
        price_float = float(price_replaced)
        for barrel in barrels_list:
            x = {
                "price": price_float,
                "barrel length": barrel,
                "title": titleText,
                "manufacturer": manuText,
                "description": descText,
                "desc cont": moreText,
                "image Location": imgurl.replace('..', '')
            }
            dump = json.dumps(x)
            print(dump)
Do not hesitate to ask if it still does not work!
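Since the stated goal is to dump the data to JSON, one further option (a sketch, not part of the fix above) is to collect each x dict in a list and write a single JSON array at the end, instead of printing each dump:

import json

all_products = []          # append each x dict here inside the loop,
# all_products.append(x)   # in place of dump = json.dumps(x); print(dump)

# after the product loop, write everything out in one go
with open('products.json', 'w') as f:
    json.dump(all_products, f, indent=2)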
I'm having an issue with bs4 when reading the second value in an array within a for loop. Below I will paste the code.
When I use line #19 (the single-URL array), I receive no errors. When I swap it out for the entire array (line #18), it errors out when it attempts to gather the second value. Note that the second value in the array is the same value as line #19.
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
SmartLiving_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="
Headers = "Description, URL, Price \n"
text_file = open("HayneedlePrices.csv", "w")
text_file.write(Headers)
text_file.close()
URL_Array = [SmartLiving_IDS, IEL_IDS, TD_IDS]
#URL_Array = [IEL_IDS]
for URL in URL_Array:
    print("\n" + "Loading New URL:" "\n" + URL + "\n" + "\n")
    uClient = uReq(URL)
    page_html = uClient.read()
    uClient.close()
    soup = soup(page_html, "html.parser")
    Containers = soup.findAll("div", {"product-card__container___1U2Sb"})
    for Container in Containers:
        Title = Container.div.img["alt"]
        Product_URL = Container.a["href"]
        Price_Container = Container.findAll("div", {"class":"product-card__productInfo___30YSc body no-underline txt-black"})[0].findAll("span", {"style":"font-size:20px"})
        Price_Dollars = Price_Container[0].get_text()
        Price_Cents = Price_Container[1].get_text()
        print("\n" + "#" * 150 + "\n")
        # print(" Container: " + "\n" + str(Container))
        # print("\n" + "-" * 150 + "\n")
        print(" Description: " + str(Title))
        print(" Product URL: " + str(Product_URL))
        print(" Price: " + str(Price_Dollars) + str(Price_Cents))
        print("\n" + "#" * 150 + "\n")
        text_file = open("HayneedlePrices.csv", "a")
        text_file.write(str(Title) + ", " + str(Product_URL) + ", " + str(Price_Dollars) + str(Price_Cents) + "\n")
        text_file.close()
    print("Information gathered and Saved from URL Successfully.")
    print("Looking for Next URL..")
print("No Additional URLs to Gather. Process Completed.")
The problem is that you import BeautifulSoup as soup and also define a variable soup = soup(page_html, "html.parser") with the same name! After the first pass through the loop, the name soup is bound to the parsed document rather than the BeautifulSoup class, so the next iteration's soup(page_html, "html.parser") call no longer constructs a parser, and the code errors out on the second URL.
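In miniature, the failure mode looks something like this (a toy illustration, not the site's markup):

from bs4 import BeautifulSoup as soup

pages = ["<p>one</p>", "<p>two</p>", "<p>three</p>"]
for page_html in pages:
    # first pass: calls the BeautifulSoup class, as intended;
    # later passes: 'soup' has been rebound to the previous result,
    # so this call no longer does what you expect and soon errors out
    soup = soup(page_html, "html.parser")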
I refactored your code a bit, let me know if it works as expected!
import csv
import requests
from bs4 import BeautifulSoup
smart_living_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="
site_URLs = [smart_living_IDS, IEL_IDS, TD_IDS]
sess = requests.Session()
prod_data = []
for curr_URL in site_URLs:
    req = sess.get(url=curr_URL)
    soup = BeautifulSoup(req.content, "lxml")
    containers = soup.find_all("div", {"product-card__container___1U2Sb"})
    for curr_container in containers:
        prod_title = curr_container.div.img["alt"]
        prod_URL = curr_container.a["href"]
        price_container = curr_container.find(
            "div",
            {"class": "product-card__productInfo___30YSc body no-underline txt-black"},
        )
        dollars_elem = price_container.find("span", {"class": "main-price-dollars"})
        cents_elem = dollars_elem.find_next("span")
        prod_price = dollars_elem.get_text() + cents_elem.get_text()
        prod_price = float(prod_price[1:])
        prod_data.append((prod_title, prod_URL, prod_price))
CSV_headers = ("title", "URL", "price")
with open("../out/hayneedle_prices.csv", "w", newline="") as file_out:
    writer = csv.writer(file_out)
    writer.writerow(CSV_headers)
    writer.writerows(prod_data)
I tested it by repeating the current URL list 10 times; it took longer than I was anticipating. There are certainly improvements to be made: I might rewrite it to use lxml in the next few days, and multiprocessing might also be a good option. It all depends on how you're using this, of course :)
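For what it's worth, a minimal sketch of the multiprocessing idea, fetching the pages in parallel with a small worker pool (the per-URL work is collapsed into a simple count here; in practice you would return the extracted product tuples):

from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

def scrape_page(url):
    # fetch and parse one results page; returns just a container count
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "lxml")
    return len(soup.find_all("div", {"product-card__container___1U2Sb"}))

if __name__ == "__main__":
    urls = [
        "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy=",
        # plus the other two search URLs from above
    ]
    with Pool(processes=3) as pool:  # one worker per URL
        counts = pool.map(scrape_page, urls)
    print(counts)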
Originally, my code is:
# encoding = utf-8
from bs4 import BeautifulSoup
import urllib
import re
import os
url = []
urlbase = "https://quizlet.com/subject/四级乱序/page/"
for i in range(0,2):
    url.append(urlbase + str(i+1))
    indexname = str(url[i])[-1] + ".html"
    urllib.urlretrieve(url[i], indexname)
    print indexname + " downloaded"
    f = open(indexname,"r")
    soup = BeautifulSoup(f, "html.parser")
    linkclass = soup.find_all("a", attrs={"class":"SetPreview-link","href":re.compile(r"unit(\s\w+)?")})
    for link in linkclass:
        flink = link.get("href")
        print flink
The result is a number of links; it works just fine.
BUT when I write the links to a file, like this:
# encoding = utf-8
from bs4 import BeautifulSoup
import urllib
import re
import os
url = []
urlbase = "https://quizlet.com/subject/四级乱序/page/"
flinkfile = open("links.txt",'wb')
for i in range(0,2):
    url.append(urlbase + str(i+1))
    indexname = str(url[i])[-1] + ".html"
    urllib.urlretrieve(url[i], indexname)
    print indexname + " downloaded"
    f = open(indexname,"r")
    soup = BeautifulSoup(f, "html.parser")
    linkclass = soup.find_all("a", attrs={"class":"SetPreview-link", "href":re.compile(r"unit(\s\w+)?")})
    for link in linkclass:
        flink = link.get("href")
        flinkfile.writelines(flink)
    flinkfile.close()
The result is a txt file with only one line like this:
https://quizlet.com/146113318/unit31-flash-cards/
Why is that?
The issue is that the file close is inside the for i in range(0,2) loop. If you move it out, you should get more lines (assuming there are more):
# encoding = utf-8
from bs4 import BeautifulSoup
import urllib
import re
import os
url = []
urlbase = "https://quizlet.com/subject/四级乱序/page/"
flinkfile = open("links.txt",'wb')
for i in range(0,2):
    url.append(urlbase + str(i+1))
    indexname = str(url[i])[-1] + ".html"
    urllib.urlretrieve(url[i], indexname)
    print indexname + " downloaded"
    f = open(indexname,"r")
    soup = BeautifulSoup(f, "html.parser")
    linkclass = soup.find_all("a", attrs={"class":"SetPreview-link", "href":re.compile(r"unit(\s\w+)?")})
    for link in linkclass:
        flink = link.get("href")
        flinkfile.writelines(flink)
# close file outside the loop
flinkfile.close()
To ensure the file is closed even if an error occurs, use the with statement:
with open(...) as flinkfile:
    for i in range(0,2):
        ...
More info on with here: http://effbot.org/zone/python-with-statement.htm
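Applied to the snippet above, a minimal sketch would be (same Python 2 idioms as the original; note the added newline after each link, since writelines does not insert one):

# encoding = utf-8
from bs4 import BeautifulSoup
import urllib
import re

urlbase = "https://quizlet.com/subject/四级乱序/page/"
with open("links.txt", 'wb') as flinkfile:
    for i in range(0, 2):
        page_url = urlbase + str(i + 1)
        indexname = page_url[-1] + ".html"
        urllib.urlretrieve(page_url, indexname)
        soup = BeautifulSoup(open(indexname, "r"), "html.parser")
        linkclass = soup.find_all("a", attrs={"class": "SetPreview-link",
                                              "href": re.compile(r"unit(\s\w+)?")})
        for link in linkclass:
            flinkfile.write(link.get("href") + "\n")
# the file is closed automatically when the with block exits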