I have this soup:
The webpage has references of companies in a grid view (16 rows x 5 columns) and I want to retrieve each reference's url and the title. The problem is that all 5 references in each row, are in one class named row and when I'm scraping the page, I can only see the first reference of every row, instead of all 5 of them. Here is my code so far:
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
try:
title = entry.find('img').get('title')
url = entry.a['href']
urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
row = [{'Company Name': title, 'Web Page': url}]
references = references.append(row, ignore_index=True)
except:
pass
Is there a way to fix this?
I think you should iterate over the "img" or over the "a".
You can write something like this:
for entry in info_block:
try:
for a in entry.find_all("a"):
title = a.find('img').get('title')
url = a.get('href')
urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
row = [{'Company Name': title, 'Web Page': url}]
references = references.append(row, ignore_index=True)
except:
pass
import pandas as pd
from bs4 import BeautifulSoup
import requests
url = 'http://www.slimstock.com/nl/referenties/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
info_block = soup.find_all("div", attrs={"class": "row"})
references = pd.DataFrame(columns=['Company Name', 'Web Page'])
for entry in info_block:
anchors = entry.find_all("a")
for a in anchors:
try:
title = a.find('img').get('title')
url = a['href']
# urlcontent = BeautifulSoup(requests.get(url).content, "lxml")
row = [{'Company Name': title, 'Web Page': url}]
references = references.append(row, ignore_index=True)
except:
pass
Related
I am attempting to extract links from a website that does not use a href. I have tried multiple iterations of trying to find the tag associated with the url that from what I can gather is between <span> elements.
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')
for list in lists:
title = list.find('span', class_ = '$0')
webs = list.find('#text', class_ = 'fa-fa.link')
address = list.find('ul', class_ = 'post-meta')
temp = list.find('span', class_ = 'text')
temp2 = list.find('i', class_ = '(text)')
info = [title, webs, address, temp, temp2]
f.write(str(info))
f.write("\n")
print(info)
The desired output is to extract data from <span></span> where the 345 40th Ave N and the url below i class = 'fa fa-link' and i class = 'fa fa-phone' where the three elements are placed into a CSV File
You could call next element e.find(class_ = 'fa-link').nextafter selecting the <i> with class fa-link:
for e in lists:
print(e.find(class_ = 'fa-link').next.strip() if e.find(class_ = 'fa-link') else '')
Note: Do not use reserved keywords like list and always check if element you are searching for is available.
Example
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
soup = BeautifulSoup(page.content, 'html.parser')
with open('somefile.csv', 'a', encoding='utf-8') as f:
for e in soup.find_all('div', class_ = 'listing-item-inner'):
title = e.h3.text
webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
address = e.span.text
phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
f.write(','.join([title, webs, address, phone])+'\n')
I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the html table and they print out as expected. But when I try to loop through them (links) with request.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
links = (ref['href'])
print (links)
for link in links:
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
table = []
# Find all the divs we need in one go.
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
# find all the enclosing a tags.
anchors = div.find_all('a')
for anchor in anchors:
# Now we have groups of 3 list items (li) tags
lis = anchor.find_all('li')
# we clean up the text from the group of 3 li tags and add them as a list to our table list.
table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
Your mistake is second for loop in code
for ref in table.find_all('a', href=True):
links = (ref['href'])
print (links)
for link in links:
ref['href'] gives you single url but you use it as list in next for loop.
So you have
for link in ref['href']:
and it gives you first char from url http://properties.kimcore... which is h
Full working code
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
link = ref['href']
print(link)
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
table = []
# Find all the divs we need in one go.
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
# find all the enclosing a tags.
anchors = div.find_all('a')
for anchor in anchors:
# Now we have groups of 3 list items (li) tags
lis = anchor.find_all('li')
# we clean up the text from the group of 3 li tags and add them as a list to our table list.
table.append([unicodedata.normalize("NFKD",lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
BTW: if you use comma in (ref['href'], ) then you get tuple and then second for works correclty.
EDIT: it create list table_data at start and add all data into this list. And it convert into DataFrame at the end.
But now I see it read the same page few times - because in every row the same url is in every column. You would have to get url only from one column.
EDIT: now it doesn't read the same url many times
EDIT: now it get text and hre from first link and add to every element in list when you use append().
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table_data = []
# all rows in table except first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
# link in first column (td[0]
#link = row.select('td')[0].find('a')
link = row.find('a')
link_href = link['href']
link_text = link.text
print('text:', link_text)
print('href:', link_href)
page = requests.get(link_href)
soup = BeautifulSoup(page.content, 'html.parser')
divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
for div in divs:
anchors = div.find_all('a')
for anchor in anchors:
lis = anchor.find_all('li')
item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
item2 = lis[1].text
item3 = lis[2].text.strip()
table_data.append([item1, item2, item3, link_text, link_href])
print('table_data size:', len(table_data))
headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)
I'm pretty new to Python and mainly need it for getting information from website.
def spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.example.com'
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, "html.parser")
for link in soup.findAll('a', {'class': 'c5'}):
href = link.get('href')
time.sleep(0.3)
# print(href)
single_item(href)
page += 1
def single_item(item_url):
s_code = requests.get(item_url)
p_text = s_code.text
soup = BeautifulSoup(p_text, "html.parser")
upc = ('div', {'class': 'product-upc'})
for upc in soup.findAll('span', {'class': 'upcNum'}):
print(upc.string)
sku = ('span', {'data-selenium': 'bhSku'})
for sku in soup.findAll('span', {'class': 'fs16 c28'}):
print(sku.text)
price = ('span', {'class': 'price'})
for price in soup.findAll('meta', {'itemprop': 'price'}):
print(price)
outFile = open(r'C:\Users\abc.txt', 'a')
outFile.write(str(upc))
outFile.write("\n")
outFile.write(str(sku))
outFile.write("\n")
outFile.write(str(price))
outFile.write('\n')
outFile.close()
spider(1)
What i want to get is "UPC:813066012487, price:26.45 and SKU:KBPTMCC2" without any span, meta or content attributes.I attached my output below
Here is my output:
screenshot
Where do i do wrong ?
Hope someone can figure it out! Thanks!!
The data you want is in the div attribute data-itemdata, you can call json.loads and it will give you a dict that you can access to get what you want:
from bs4 import BeautifulSoup
import requests
import json
soup = BeautifulSoup(requests.get("https://www.bhphotovideo.com/c/buy/accessories/ipp/100/mnp/25/Ns/p_PRICE_2%7c0/ci/20861/pn/1/N/4005352853+35").content, "html.parser")
for d in soup.select("div[data-selenium=itemDetail]"):
data = json.loads(d["data-itemdata"])
print(data)
Each data dict will look like:
{u'catagoryId': u'20861',
u'inCart': False,
u'inWish': False,
u'is': u'REG',
u'itemCode': u'KBPTMCC2',
u'li': [],
u'price': u'26.45',
u'searchTerm': u'',
u'sku': u'890522'}
So just access by key i.e price = data["price"].
To get the UPC we just need to visit the items page, we can get the url from h3 with the data-selenium attribute:
for d in soup.select("div[data-selenium=itemDetail]"):
url = d.select_one("h3[data-selenium] a")["href"]
upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum").text.strip()
data = json.loads(d["data-itemdata"])
Not all pages have a UPC value so you will have to decide what to do, if you just want products with UPC's first check if the select finds anything:
for d in soup.select("div[data-selenium=itemDetail]"):
url = d.select_one("h3[data-selenium] a")["href"]
upc = BeautifulSoup(requests.get(url).content, "html.parser").select_one("span.upcNum")
if upc:
data = json.loads(d["data-itemdata"])
text = (upc.text.strip()
I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated it!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
for link in possible_links:
if link.has_attr('href'):
boat_links = link.attrs['href']
return boat_links
search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links #why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?
def extract_from_listing(boat_listing):
r = requests.get(boat_listing)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
table_heads = soup.find_all('th')
for th in table_heads:
if th.text =="Make":
make = th.find_next_sibling("td").text
price = soup.find('span', {'class': 'bd-price'})
formatted_price = price.string.strip()
contact_info = soup.find('div', {'class': 'phone'})
reversed_phone = contact_info.string[::-1]
temp_phone = reversed_phone.replace(')', '}')
temp_phone2 = temp_phone.replace('(', ')')
correct_phone = temp_phone2.replace("}", "(")
return make, formatted_price, correct_phone
boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
print make
print price
print phone
You are only returning the last link, you need to append:
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
boat_links = [] # create list to append all inks to
for link in possible_links:
if link.has_attr('href'):
boat_links.append(link.attrs['href']) # append each link
return boat_links
Or use a list comp:
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.content # use content to let requests handle the decoding
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
return [link.attrs['href'] for link in possible_links if link.has_attr('href')]
I want to scrape a web to gather the data for studying data mining. This web data contains a big table with 43 pages. And it also hide some stocks at the most right hand side of the expand menu.
The web page is below.
http://data.10jqka.com.cn/market/longhu/yyb/
import bs4
import requests
url = r"http://data.10jqka.com.cn/market/longhu/yyb/"
response = requests.get(url)
if response.status_code == 200:
content = response.content
soup = bs4.BeautifulSoup(content)
table_results = soup.findAll("table", {"class": "m_table"})
for item in table_results:
company_name = item.findAll("td", {"class": "tl"})[0].text.strip()
detail = item.findAll("td", {"class": "tc"})[0].text.strip()
c_rise = item.findAll("td", {"class": "c_rise"})[0].text.strip()
c_fall = item.findAll("td", {"class": "c_fall"})[0].text.strip()
cur = item.findAll("td", {"class": "cur"})[0].text.strip()
lhb_stocklist = item.findAll("div", {"class": "lhb_stocklist"})[0].text.strip()
print company_name, detail, c_rise, c_fall, lhb_stocklist
A solution based on requests, BeautifulSoup, and lxml:
import json
import requests
from bs4 import BeautifulSoup
URL = 'http://data.10jqka.com.cn/interface/market/longhuyyb/stocknum/desc/%d/20'
# config end_page as needed, or parse http://data.10jqka.com.cn/market/longhu/yyb/ to make it auto adapted
end_page = 2
result = []
for page_idx in range(1, end_page + 1):
print 'Extracting page', page_idx
raw_response = requests.get(URL % page_idx)
page_content = json.loads(raw_response.text)['data']
html = BeautifulSoup(page_content, 'lxml')
for row in html.tbody.find_all('tr'):
company = row.find(class_='tl').text
detail_link = row.find(class_='tl').a['href']
buy = float(row.find(class_='c_rise').text)
sell = float(row.find(class_='c_fall').text)
stock_cnt = int(row.find(class_='cur').text)
stocks = []
for a in row.find(class_='lhb_stocklist_box hide').p.find_all('a'):
stocks.append((a.text, a['href']))
result.append({
'company': company,
'detail_link': detail_link,
'buy': buy,
'sell': sell,
'stock_cnt': stock_cnt,
'stocks': stocks,
})
print 'Company number:', len(result)
I put all data into a list of dictionaries, for easy accessing. You can modify the codes to directly write to a CSV or whatever