I'm pretty new to Python and mainly need it for getting information from websites.
def spider(max_pages):
    """Fetch the listing page up to ``max_pages`` times and pass each matching link to ``single_item``."""
    page = 1
    while page <= max_pages:
        # NOTE(review): the URL does not depend on `page`, so every iteration
        # requests the same address.
        url = 'https://www.example.com'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        # Only anchors carrying the CSS class "c5" are followed.
        for link in soup.findAll('a', {'class': 'c5'}):
            href = link.get('href')
            # Short pause before each item-page request.
            time.sleep(0.3)
            # print(href)
            single_item(href)
        page += 1
def single_item(item_url):
    """Fetch one item page, print its UPC/SKU/price nodes and append them to a text file."""
    s_code = requests.get(item_url)
    p_text = s_code.text
    soup = BeautifulSoup(p_text, "html.parser")
    # This tuple is only a placeholder: the loop below rebinds `upc` to each
    # matching tag, so the tuple survives only when nothing matches.
    upc = ('div', {'class': 'product-upc'})
    for upc in soup.findAll('span', {'class': 'upcNum'}):
        print(upc.string)
    # Same pattern: placeholder tuple, then `sku` is rebound by the loop.
    sku = ('span', {'data-selenium': 'bhSku'})
    for sku in soup.findAll('span', {'class': 'fs16 c28'}):
        print(sku.text)
    # Same pattern for `price`.
    price = ('span', {'class': 'price'})
    for price in soup.findAll('meta', {'itemprop': 'price'}):
        # Prints the tag object itself, not one of its attribute values.
        print(price)
    # At this point upc/sku/price hold the LAST matched tag of each loop (or
    # the placeholder tuple). str() of a tag is its full markup, which is why
    # the file contains span/meta markup instead of the bare values.
    outFile = open(r'C:\Users\abc.txt', 'a')
    outFile.write(str(upc))
    outFile.write("\n")
    outFile.write(str(sku))
    outFile.write("\n")
    outFile.write(str(price))
    outFile.write('\n')
    outFile.close()


# Script entry point: run the crawl for a single page.
spider(1)
What I want to get is "UPC:813066012487, price:26.45 and SKU:KBPTMCC2" without any span, meta or content attributes. I attached my output below.
Here is my output:
screenshot
What am I doing wrong?
Hope someone can figure it out! Thanks!!
The data you want is in the div's data-itemdata attribute. Call json.loads on it and you get a dict from which you can read what you need:
from bs4 import BeautifulSoup
import requests
import json
# Download the listing page, then decode the JSON payload stored on each
# item container and print the resulting dict.
listing_url = "https://www.bhphotovideo.com/c/buy/accessories/ipp/100/mnp/25/Ns/p_PRICE_2%7c0/ci/20861/pn/1/N/4005352853+35"
soup = BeautifulSoup(requests.get(listing_url).content, "html.parser")
item_nodes = soup.select("div[data-selenium=itemDetail]")
for d in item_nodes:
    data = json.loads(d["data-itemdata"])
    print(data)
Each data dict will look like:
{u'catagoryId': u'20861',
u'inCart': False,
u'inWish': False,
u'is': u'REG',
u'itemCode': u'KBPTMCC2',
u'li': [],
u'price': u'26.45',
u'searchTerm': u'',
u'sku': u'890522'}
So just access the values by key, e.g. price = data["price"].
To get the UPC we just need to visit the items page, we can get the url from h3 with the data-selenium attribute:
for d in soup.select("div[data-selenium=itemDetail]"):
    # The link to the item page sits inside the <h3> that has a
    # data-selenium attribute.
    url = d.select_one("h3[data-selenium] a")["href"]
    item_page = BeautifulSoup(requests.get(url).content, "html.parser")
    upc_node = item_page.select_one("span.upcNum")
    # Fails with AttributeError when the item page has no UPC element.
    upc = upc_node.text.strip()
    data = json.loads(d["data-itemdata"])
Not all pages have a UPC value, so you will have to decide what to do. If you only want products that have a UPC, first check whether the select finds anything:
for d in soup.select("div[data-selenium=itemDetail]"):
    # The link to the item page sits inside the <h3> that has a
    # data-selenium attribute.
    url = d.select_one("h3[data-selenium] a")["href"]
    upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum")
    # select_one returns None when the item page has no UPC element, so
    # products without a UPC are skipped here.
    if upc is not None:
        data = json.loads(d["data-itemdata"])
        text = upc.text.strip()
Related
import requests
from bs4 import BeautifulSoup
# Download the chart page and parse it.
url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
# Never filled or read anywhere in this snippet.
urls = []
# Prints the href of every anchor on the page, with no filtering at all.
for link in soup.find_all('a'):
    print(link.get('href'))
def chart_spider(max_pages):
    """Fetch the singles-chart page and try to print something for each ``a.title`` link."""
    page = 1
    # NOTE(review): `page` only ever increases, so once `page >= max_pages`
    # is true it stays true and the loop never ends by itself.
    while page >= max_pages:
        # The URL does not depend on `page`.
        url = "https://www.officialcharts.com/charts/singles-chart"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        # Looks for <a> elements with class "title".
        for link in soup.findAll('a', {"class": "title"}):
            # NOTE(review): `href` is read on the right-hand side before it has
            # ever been assigned in this function.
            href = "BAD HABITS" + link.title(href)
            print(href)
        page += 1


chart_spider(1)
Wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the top 100 charts and print all the titles for now. Thanks
Here is a possible solution, which modifies your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
URL = 'https://www.officialcharts.com/charts/singles-chart'


def chart_spider():
    """Print one line per ``<div class="title">`` found on the chart page."""
    chart_page = BeautifulSoup(requests.get(URL).text, 'html.parser')
    title_divs = chart_page.find_all('div', {"class": "title"})
    for div in title_divs:
        # The second child node of each title <div> is the one that is printed.
        print(div.contents[1].string)


chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the titles for each song on the top 100,
this code:
import requests
from bs4 import BeautifulSoup
# Collect the text of every <div class="title">, with newline characters removed.
url = 'https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = []
for title_div in soup.find_all('div', class_="title"):
    titles.append(title_div.text.replace('\n', ''))
does what you are looking for.
You can do it like this.
The Song title is present inside a <div> tag with class name as title.
Select all those <div> with .find_all(). This gives you a list of all <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests
# Fetch the chart page and print the stripped text of each title <div>.
url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
d = soup.find_all('div', class_='title')
for title_div in d:
    song = title_div.text.strip()
    print(song)
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.
I am trying to use BeautifulSoup to scrape resumes on Indeed, but I ran into some problems.
here is the sample site: https://www.indeed.com/resumes?q=java&l=&cb=jt
here is my code:
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')


def scrape_job_title(soup):
    """Collect the ``title`` attribute of every matching link inside each ``li.sre`` element."""
    job = []
    for div in soup.find_all(name='li', attrs={'class':'sre'}):
        # Matches anchors whose class is the hyphenated "app-link"; if the
        # page uses a different spelling nothing is found and `job` stays empty.
        for a in div.find_all(name='a', attrs={'class':'app-link'}):
            # Reads the anchor's `title` attribute, not its visible text.
            job.append(a['title'])
    return(job)


scrape_job_title(soup)
It prints out nothing: []
As you can see in the picture, I want to grab the job title "Java developer".
The class is app_link, not app-link. Additionally, a['title'] doesn't do what you want. Use a.contents[0] instead.
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')


def scrape_job_title(soup):
    """Return the first child node of every ``app_link`` anchor found in each ``li.sre``."""
    return [
        a.contents[0]
        for result_item in soup.find_all(name='li', attrs={'class': 'sre'})
        for a in result_item.find_all(name='a', attrs={'class': 'app_link'})
    ]


scrape_job_title(soup)
Try this to get all the job titles:
import requests
from bs4 import BeautifulSoup
URL = "https://www.indeed.com/resumes?q=java&l=&cb=jt"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html5lib')
# For each ".sre" element, gather the text of its ".app_link" descendants
# and print that list.
for result in soup.select('.sre'):
    data = []
    for link in result.select('.app_link'):
        data.append(link.text)
    print(data)
I have this soup:
The webpage has references of companies in a grid view (16 rows x 5 columns) and I want to retrieve each reference's url and the title. The problem is that all 5 references in each row, are in one class named row and when I'm scraping the page, I can only see the first reference of every row, instead of all 5 of them. Here is my code so far:
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
# Every <div class="row"> is treated as one entry.
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
    try:
        # find() and the `.a` shortcut each return only the FIRST match inside
        # the row, so at most one reference per row is recorded.
        title = entry.find('img').get('title')
        url = entry.a['href']
        # The linked page is fetched and parsed, but `urlcontent` is not used
        # afterwards in this snippet.
        urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
        row = [{'Company Name': title, 'Web Page': url}]
        references = references.append(row, ignore_index=True)
    except:
        # Any error raised for a row is silently discarded.
        pass
Is there a way to fix this?
I think you should iterate over the "img" or over the "a".
You can write something like this:
for entry in info_block:
    # Walk every anchor of the row instead of only the first one.
    for a in entry.find_all("a"):
        img = a.find('img')
        url = a.get('href')
        # Skip anchors that are not image links explicitly, rather than
        # hiding every possible error behind a bare `except: pass` that also
        # aborted the remaining anchors of the row.
        if img is None or url is None:
            continue
        title = img.get('title')
        # The linked page is not downloaded: its parsed content was never used.
        row = pd.DataFrame([{'Company Name': title, 'Web Page': url}])
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        references = pd.concat([references, row], ignore_index=True)
import pandas as pd
from bs4 import BeautifulSoup
import requests
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})

# Gather plain dicts first and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
rows = []
for entry in info_block:
    for a in entry.find_all("a"):
        img = a.find('img')
        # Only anchors that wrap an image and carry an href are recorded;
        # checking this explicitly replaces the former bare `except: pass`.
        if img is None or not a.has_attr('href'):
            continue
        rows.append({'Company Name': img.get('title'), 'Web Page': a['href']})

references = pd.DataFrame(rows, columns=['Company Name', 'Web Page'])
I am trying to create a simple Web Crawler in Python, and when I'm running it it's showing no errors but it's also not printing any results as intended.
I've put my current code below, could anyone please point me in the direction of the problem?
import requests
from bs4 import BeautifulSoup
def stepashka_spider(max_pages):
    """Request pages 1..max_pages and try to print the href of each ``a.video-title`` element."""
    page = 1
    while page <= max_pages:
        # NOTE(review): the page number is placed after '#', i.e. in the URL
        # fragment — confirm the server actually receives it.
        url = "http://online.stepashka.com/filmy/#/page/" + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        # No parser is named here, so BeautifulSoup picks one itself.
        soup = BeautifulSoup(plain_text)
        # Searches <a> elements with class "video-title".
        for resoult in soup.findAll("a", {"class": "video-title"}):
            # `href` is a bare name here, not the string "href".
            href = resoult.get(href)
            print(href)
        page += 1


stepashka_spider(1)
"video-title" is in a div tag, you also need to pass a string "href":
def stepashka_spider(max_pages):
    """Print the link found in every ``div.video-title`` for pages 1..max_pages.

    ``max_pages`` is the number of the last page to request (inclusive).
    Network errors raised by ``requests`` propagate to the caller.
    """
    page = 1
    while page <= max_pages:
        url = "http://online.stepashka.com/filmy/#/page/" + str(page)
        # A timeout keeps a stalled server from blocking the crawl forever.
        source_code = requests.get(url, timeout=10)
        plain_text = source_code.text
        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser is installed, warns, and may build a different tree per machine.
        soup = BeautifulSoup(plain_text, "html.parser")
        for resoult in soup.find_all("div", {"class": "video-title"}):
            a_tag = resoult.a
            # A title block without a link (or a link without href) is skipped
            # instead of raising TypeError/KeyError mid-crawl.
            if a_tag is None or not a_tag.has_attr("href"):
                continue
            print(a_tag["href"])
        page += 1


stepashka_spider(1)
Output:
http://online.stepashka.com/filmy/komedii/37878-klub-grust.html
http://online.stepashka.com/filmy/dramy/37875-kadr.html
http://online.stepashka.com/filmy/multfilmy/37874-betmen-protiv-robina.html
http://online.stepashka.com/filmy/fantastika/37263-hrustalnye-cherepa.html
http://online.stepashka.com/filmy/dramy/34369-bozhiy-syn.html
http://online.stepashka.com/filmy/trillery/37873-horoshee-ubiystvo.html
http://online.stepashka.com/filmy/trillery/34983-zateryannaya-reka.html
http://online.stepashka.com/filmy/priklucheniya/37871-totem-volka.html
http://online.stepashka.com/filmy/fantastika/35224-zheleznaya-shvatka.html
http://online.stepashka.com/filmy/dramy/37870-bercy.html
You are actually using the wrong URL format. We can also iterate with a for loop over range instead of maintaining the counter by hand in a while loop:
def stepashka_spider(max_pages):
    """Print, page by page, the link found in every ``div.video-title``.

    Pages 1 through ``max_pages`` (inclusive) are requested in order; each
    page's links are preceded by a header line and followed by a blank line.
    Network errors raised by ``requests`` propagate to the caller.
    """
    for page in range(1, max_pages + 1):
        url = "http://online.stepashka.com/filmy/page/{}/".format(page)
        # A timeout keeps a stalled server from blocking the crawl forever.
        source_code = requests.get(url, timeout=10)
        plain_text = source_code.text
        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser is installed, warns, and may build a different tree per machine.
        soup = BeautifulSoup(plain_text, "html.parser")
        print("Movies for page {}".format(page))
        for resoult in soup.find_all("div", {"class": "video-title"}):
            a_tag = resoult.a
            # Skip title blocks that carry no usable link.
            if a_tag is None or not a_tag.has_attr("href"):
                continue
            print(a_tag["href"])
        print()
Output:
Movies for page 1
http://online.stepashka.com/filmy/dramy/37895-raskop.html
http://online.stepashka.com/filmy/semejnyj/36275-domik-v-serdce.html
http://online.stepashka.com/filmy/dramy/35371-enni.html
http://online.stepashka.com/filmy/trillery/37729-igra-na-vyzhivanie.html
http://online.stepashka.com/filmy/trillery/37893-vosstavshie-mertvecy.html
http://online.stepashka.com/filmy/semejnyj/30104-sedmoy-syn-seventh-son-2013-treyler.html
http://online.stepashka.com/filmy/dramy/37892-sekret-schastya.html
http://online.stepashka.com/filmy/uzhasy/37891-davayte-poohotimsya.html
http://online.stepashka.com/filmy/multfilmy/3404-specagent-archer-archer-archer-2010-2013.html
http://online.stepashka.com/filmy/trillery/37334-posledniy-reys.html
Movies for page 2
http://online.stepashka.com/filmy/komedii/37890-top-5.html
http://online.stepashka.com/filmy/komedii/37889-igra-v-doktora.html
http://online.stepashka.com/filmy/dramy/36651-vrozhdennyy-porok.html
http://online.stepashka.com/filmy/komedii/37786-superforsazh.html
http://online.stepashka.com/filmy/fantastika/35003-voshozhdenie-yupiter.html
http://online.stepashka.com/filmy/sport/37888-ufc-on-fox-15-machida-vs-rockhold.html
http://online.stepashka.com/filmy/semejnyj/37558-prizrak.html
http://online.stepashka.com/filmy/boeviki/36865-mordekay.html
http://online.stepashka.com/filmy/dramy/37884-stanovlenie-legendy.html
http://online.stepashka.com/filmy/trillery/37883-tainstvo.html
Movies for page 3
http://online.stepashka.com/filmy/dramy/37551-nochnoy-beglec.html
http://online.stepashka.com/filmy/dramy/37763-mech-drakona.html
http://online.stepashka.com/filmy/trillery/36471-paren-po-sosedstvu.html
http://online.stepashka.com/filmy/dramy/36652-amerikanskiy-snayper.html
http://online.stepashka.com/filmy/dramy/37555-feniks.html
http://online.stepashka.com/filmy/semejnyj/35156-gnezdo-drakona-vosstanie-chernogo-drakona.html
http://online.stepashka.com/filmy/kriminal/37882-ch-b.html
http://online.stepashka.com/filmy/priklucheniya/37881-admiral-bitva-za-men-ryan.html
http://online.stepashka.com/filmy/trillery/37880-malyshka.html
http://online.stepashka.com/filmy/trillery/36417-poteryannyy-ray.html
I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests
def extract_from_search(search_results):
    """Fetch a search-results page and return an href taken from its ``a.btn.btn-orange`` links."""
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    for link in possible_links:
        if link.has_attr('href'):
            # Rebinds `boat_links` on every match, so only the href of the
            # last matching link is left when the loop ends.
            boat_links = link.attrs['href']
    # NOTE(review): `boat_links` is never assigned if no link has an href.
    return boat_links


search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links #why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?
def extract_from_listing(boat_listing):
    """Return ``(make, price, phone)`` scraped from one boat listing page.

    ``boat_listing`` is the URL of the listing. Any field that cannot be
    found on the page is returned as ``None`` instead of raising.
    Network errors raised by ``requests`` propagate to the caller.
    """
    r = requests.get(boat_listing)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Start from None so a page without a "Make" header cannot leave the
    # name unbound at the return statement.
    make = None
    for th in soup.find_all('th'):
        if th.text == "Make":
            cell = th.find_next_sibling("td")
            if cell is not None:
                make = cell.text

    price = soup.find('span', {'class': 'bd-price'})
    # get_text() also works when the span has nested children, where
    # `.string` would be None.
    formatted_price = price.get_text().strip() if price is not None else None

    contact_info = soup.find('div', {'class': 'phone'})
    correct_phone = None
    if contact_info is not None:
        # The phone text is reversed before use; reversing also flips the
        # direction of the brackets, so "(" and ")" are swapped back in a
        # single pass (no placeholder character that could clash with input).
        swapped = {'(': ')', ')': '('}
        reversed_phone = contact_info.get_text()[::-1]
        correct_phone = ''.join(swapped.get(ch, ch) for ch in reversed_phone)

    return make, formatted_price, correct_phone


boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(make)
print(price)
print(phone)
You are only returning the last link, you need to append:
def extract_from_search(search_results):
    """Return the href of every ``a.btn.btn-orange`` link on a search-results page."""
    response = requests.get(search_results)
    soup = BeautifulSoup(response.text, 'html.parser')
    boat_links = []  # filled with one entry per link that carries an href
    for link in soup.find_all('a', {'class': 'btn btn-orange'}):
        if not link.has_attr('href'):
            continue
        boat_links.append(link.attrs['href'])  # keep every link, not just the last
    return boat_links
Or use a list comp:
def extract_from_search(search_results):
    """Return the href of every ``a.btn.btn-orange`` link on a search-results page."""
    # `.content` is the undecoded response body; the parser receives the raw
    # bytes and determines the encoding itself.
    page_bytes = requests.get(search_results).content
    soup = BeautifulSoup(page_bytes, 'html.parser')
    return [
        anchor.attrs['href']
        for anchor in soup.find_all('a', {'class': 'btn btn-orange'})
        if anchor.has_attr('href')
    ]