web-scrape: get H4 attributes & href

web-scrape: get H4 attributes & href - python

I am trying to scrape a website, but I can't get access to the attributes of some fields.
Here is the code I used:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
# Question code, reproduced exactly as posted (indentation was lost in extraction).
# Intent: visit category/page listing URLs and collect product names and prices
# into the `scrap_list` DataFrame.
scrap_list = pd.DataFrame()
# NOTE(review): both loops bind the same name `path`, so the inner loop
# overwrites the outer (category) index.
for path in range(10): # scroll over the categories
for path in range(10): # scroll over the pages
# NOTE(review): `page` is not assigned anywhere in this snippet.
url = 'https://www.samehgroup.com/index.php?route=product/category'+str(page)+'&'+'path='+ str(path)
req = urllib3.PoolManager()
# NOTE(review): `URL` (upper case) is a different name from the `url` built above.
res = req.request('GET', URL)
soup = BeautifulSoup(res.data, 'html.parser')
# The result of this lookup is not stored or used.
soup.findAll('h4', {'class': 'caption'})
# extract names
# NOTE(review): this searches <h2 class="caption"> while the line above queried <h4>.
scrap_name = [i.text.strip() for i in soup.findAll('h2', {'class': 'caption'})]
scrap_list['product_name']=pd.DataFrame(scrap_name,columns =['Item_name'])
# extract prices
scrap_list['product_price'] = [i.text.strip() for i in soup.findAll('div', {'class': 'price'})]
# NOTE(review): `scrap_price` is never assigned in this snippet.
product_price=pd.DataFrame(scrap_price,columns =['Item_price'])
I want an output that provides me with each product and its price. I still can't get that right.
Any help would be very much appreciated.

I think the problem here was looping through the website pages. I got the code below working by first making a list of urls containing numbered 'paths' corresponding to pages on the website. Then looping through this list and applying a page number to the url.
If you only want the products from a certain page, select its URL from urlist by index.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
# Scrape product names and discounted prices from every category listing.
#
# Results are exposed through the module-level names `urlist`, `namelist`,
# `newprice` and `df`.

# One listing URL per category; `path` values 1-9 are the ones this script visits.
urlist = [
    'https://www.samehgroup.com/index.php?route=product/category&path=' + str(i)
    for i in range(1, 10)
]

namelist = []
newprice = []

for urlunf in urlist:  # outer loop: one category ('path') at a time
    # Inner loop: result pages of the category. 100 is an upper bound (the
    # largest category was observed to have 93 pages); the loop stops early
    # as soon as a page comes back without products.
    # NOTE(review): numbering starts at page=0 as in the original answer;
    # confirm the site does not serve page 0 and page 1 identically.
    for n in range(100):
        url = urlunf + '&page=' + str(n)
        # A timeout keeps one stalled response from hanging the whole crawl.
        page = requests.get(url, timeout=30).text
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')
        products = soup.find_all('div', class_='caption')
        if not products:
            # Past the last page of this category: go to the next category
            # instead of requesting (and sleeping through) the empty pages.
            break
        for prod in products:
            name_tag = prod.find('h4')
            price_block = prod.find('p', class_='price')
            price_tag = None
            if price_block is not None:
                price_tag = price_block.find('span', class_='price-new')
            if name_tag is None or price_tag is None:
                # Skip only this product; the other products on the page
                # are still collected.
                continue
            # Append both values together so the two lists stay aligned.
            namelist.append(name_tag.text)
            newprice.append(price_tag.text)
        time.sleep(2)  # be polite to the server between page requests

# Assemble the scraped columns into a DataFrame.
df = pd.DataFrame({'name': namelist, 'price': newprice})

Related

Trying to get data from a table using beautifulsoup in python

I am trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (the HTML code is in the picture). My code returns the 'all splits' text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv
# Question code, reproduced as posted (indentation was lost in extraction).
# Builds the list of Hollinger statistics page URLs for years 2003-2004,
# pages 1-8 of each year.
urls = []
data = []
for year in range(2003, 2005):
for page in range(1, 9):
url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
# NOTE(review): an f-string is never None, so this check always passes.
if url is not None:
urls.append(url)
# Question code, reproduced as posted (indentation was lost in extraction).
# GetData(url): downloads one statistics page, takes the first player link,
# the first 'sortcell' value and the player id from it, builds per-player
# stats URLs, and defines/calls the GetStats helper that prints stat text.
def GetData(url):
names_list = [] # names of players
pers = [] # player efficency ratings
playeridlist = [] # list of player ids to be used in making new stats searchable url
statsurls = [] # list of urls generated to get player stats
# makes a pattern for the function to look for
pattern = re.compile('playerId=(\d+)')
# setsup soup function
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
# finds players names and adds to list
# NOTE(review): soup.find() yields a single match, so only one name is collected.
names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
bodytext = names.text
names_list.append(bodytext)
# finds plays player efficency rating and adds to list
pertag = soup.find('td', class_='sortcell')
per = pertag.text
pers.append(per)
# finds player id
names = soup.find('a', href=pattern)
player_id = names['href'].split('playerId=')[1]
playeridlist.append(player_id)
# uses player id to make a list of new urls for that player and get stats
for player_id in playeridlist:
# NOTE(review): `year` is not a parameter or local of GetData; it resolves to
# the module-level loop variable of the URL-building loop, if that code ran.
statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
if statsurl is not None:
statsurls.append(statsurl)
# parses stats to get stats
def GetStats(statsurl): # GO BACK AND MAKE A THREAD EXECUTER STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
statsreq = requests.get(statsurl)
statssoup = BeautifulSoup(statsreq.text, 'lxml')
focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
# NOTE(review): find() returns one <td> only, and the comprehension below then
# iterates over that single cell's children - presumably why only the row
# label text is printed rather than the numbers; find_all('td') on the row
# would give every cell. Confirm against the page markup.
playerstathtml = focusing_search.find('td', class_='Table__TD')
stat_values = [playerstats.text for playerstats in playerstathtml]
print(stat_values)
# The helper is invoked with a fixed URL; the `statsurls` list built above is not used.
GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
#name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
print(f"{bodytext}: {per}")
print(player_id)
# Runs the scraper once against page 1 of the 2003 listing.
GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')

To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Extract the column headings and the "All Splits" row from the player
# splits table and load them into a one-row DataFrame.
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# All <tbody> elements of tables inside the scrollable stats container.
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
if not t:
    # Without this guard a missing table (changed markup, blocked or
    # login-only response) surfaces as an opaque IndexError on t[0].
    raise SystemExit(f"No stats table found at {url}")

# Row 0 carries the headings, row 1 the "All Splits" values.
rows = t[0].find_all('tr')
if len(rows) < 2:
    raise SystemExit(f"Stats table at {url} has no data row")

headings = [h.text for h in rows[0].find_all('td')]
all_splits = [h.text for h in rows[1].find_all('td')]

df = pd.DataFrame([all_splits], columns=headings)
print(df)
Output:

Python issue for crawling multiple page title

I am a marketer and want to conduct some basic market research using Python.
I wrote some simple code to crawl the titles from multiple pages, but I can't get it to put the title text into a list and export it to Excel format. How can I do this?
I tried to create a list and used the extend() method to put these looped titles on the list, but it did not work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Question code, reproduced as posted (indentation was lost in extraction).
# content_get(url): starting from `url`, fetches a board index page, prints the
# title text of each entry, then follows the "‹ 上頁" link; the counter limits
# this to 4 pages.
def content_get(url):
count = 0
while count < 4: #this case was to crawl titles of 4 pages
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
titles = soup.find(id="main-container").find_all("div", class_="r-ent")
for title in titles:
# Each title is only printed (wrapped in a one-element list); nothing is
# stored or returned, so there is no list to export afterwards.
print([title.find('div', class_='title').text])
# The link is located by its visible text "‹ 上頁"; its href is site-relative.
nextpageurl = soup.find("a", string="‹ 上頁")["href"]
url = "https://www.ptt.cc" + nextpageurl
count += 1
# Index page the crawl starts from.
firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
content_get(firstpage)

You need to add the titles to a list outside of the while loop:
def content_get(url, pages=4):
    """Collect post titles from consecutive PTT board index pages.

    Starting at ``url``, each page's titles are gathered and the "‹ 上頁"
    link is followed to reach the next page to crawl.

    Args:
        url: Address of the first index page to crawl.
        pages: Maximum number of pages to visit (defaults to 4).

    Returns:
        A list of title strings with newlines removed, in crawl order.
    """
    titles = []
    for _ in range(pages):
        r = requests.get(url)
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed and emits a warning.
        soup = BeautifulSoup(r.text, "html.parser")
        title_page = [
            title.text.replace('\n', '')
            for title in soup.find_all('div', {'class': 'title'})
        ]
        titles.extend(title_page)
        # Stop cleanly when there is no further page: the link may be absent
        # or present without an href.
        nextpage = soup.find("a", string="‹ 上頁")
        if nextpage is None or not nextpage.get("href"):
            break
        url = "https://www.ptt.cc" + nextpage["href"]
    return titles
If you don't want to use the list comprehension to build title_page, it can be replaced with a traditional for loop:
# Loop-based equivalent of the list comprehension that builds `title_page`.
# The tags are kept in `title_tags` rather than `titles`, because `titles` is
# the accumulator list inside content_get and must not be overwritten.
title_page = []
title_tags = soup.find_all('div', {'class': 'title'})
for title in title_tags:
    title_page.append(title.text.replace('\n', ''))
For the excel file:
def to_excel(text, path='output.xlsx'):
    """Write a list of titles to an Excel workbook.

    Args:
        text: Iterable of title strings; each becomes one row.
        path: Destination file, defaults to ``'output.xlsx'``.

    Returns:
        None. The workbook is written as a side effect.
    """
    df = pd.DataFrame(text, columns=['Title'])
    df.to_excel(path)

Use Beautiful Soup in scraping multiple websites

I want to know why lists all_links and all_titles don't want to receive any records from lists titles and links. I have tried also .extend() method and it didn't help.
import requests
from bs4 import BeautifulSoup
# Question code, reproduced as posted (indentation was lost in extraction).
# Module-level accumulators the driver loop is meant to fill.
all_links = []
all_titles = []
# title_link(page_num): downloads listing page `page_num` and builds the lists
# `links` (absolute URLs) and `titles` from the matching <a> tags.
# NOTE(review): the function has no return statement, so `links` and `titles`
# exist only inside it.
def title_link(page_num):
page = requests.get(
'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d'
% (page_num, page_num, page_num))
soup = BeautifulSoup(page.content, 'html.parser')
links = ['https://www.gumtree.pl' + link.get('href')
for link in soup.find_all('a', class_ ="href-link tile-title-text")]
titles = [flat.next_element for flat in soup.find_all('a', class_ = "href-link tile-title-text")]
print(titles)
# Question code, reproduced as posted (indentation was lost in extraction).
# Calls title_link for pages 1-5 and tries to append each page's results to
# the module-level lists, then builds a DataFrame from them.
for i in range(1,5+1):
title_link(i)
# NOTE(review): `links` and `titles` are not defined at module scope, so these
# lines raise the NameError shown in the traceback in the answer.
all_links = all_links + links
all_titles = all_titles + titles
# The for loop advances `i` itself; this manual increment has no effect on it.
i+=1
print(all_links)
import pandas as pd
df = pd.DataFrame(data = {'title': all_titles ,'link': all_links})
df.head(100)
#df.to_csv("./gumtree_page_1.csv", sep=';',index=False, encoding = 'utf-8')
#df.to_excel('./gumtree_page_1.xlsx')

When I ran your code, I got
NameError Traceback (most recent call last)
<ipython-input-3-6fff0b33d73b> in <module>
16 for i in range(1,5+1):
17 title_link(i)
---> 18 all_links = all_links + links
19 all_titles = all_titles + titles
20 i+=1
NameError: name 'links' is not defined
That points to a problem - variable named links is not defined in a global scope (where you add it to all_links). You can read about python scopes here. You'd need to return links and titles from title_link. Something similar to this:
def title_link(page_num):
    # your code here
    # Hand both lists back to the caller; they are local to this function
    # and are discarded when it returns unless returned explicitly.
    return links, titles


for i in range(1, 5 + 1):
    # Unpack the returned pair into module-level names before accumulating.
    links, titles = title_link(i)
    all_links = all_links + links
    all_titles = all_titles + titles
print(all_links)

This code exhibits confusion about scoping. titles and links inside of title_link are local to that function. When the function ends, the data disappears and it cannot be accessed from another scope such as main. Use the return keyword to return values from functions. In this case, you'd need to return a tuple pair of titles and links like return titles, links.
Since functions should do one task only, having to return a pair reveals a possible design flaw. A function like title_link is overloaded and should probably be two separate functions, one to get titles and one to get links.
Having said that, the functions here seem like premature abstractions since the operations can be done directly.
Here's a suggested rewrite:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Collect listing titles and absolute links from result pages 1-5 and show
# them as a DataFrame.
url = "https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d"
data = {"title": [], "link": []}

for page_no in range(1, 6):
    # The page number appears three times in the URL template.
    response = requests.get(url % ((page_no,) * 3))
    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.find_all("a", class_="href-link tile-title-text")

    # The node right after each anchor tag is used as the title.
    page_titles = [anchor.next_element for anchor in anchors]
    data["title"].extend(page_titles)

    # hrefs are site-relative, so prefix them with the host.
    page_links = ["https://www.gumtree.pl" + anchor.get("href") for anchor in anchors]
    data["link"].extend(page_links)

df = pd.DataFrame(data)
print(df.head(100))
Other remarks:
i+=1 is unnecessary; for loops move forward automatically in Python.
(1,5+1) is clearer as (1, 6).
List comprehensions are great, but if they run multiple lines, consider writing them as normal loops or creating an intermediate variable or two.
Imports should be at the top of a file only. See PEP-8.
list.extend(other_list) is preferable to list = list + other_list, which is slow and memory-intensive, creating a whole copy of the list.

Try this:
import requests
from bs4 import BeautifulSoup
# Module-level accumulators filled by the driver loop.
all_links = []
all_titles = []


def title_link(page_num):
    """Fetch one listing page and return ``(links, titles)``.

    ``links`` holds absolute URLs built from each matching anchor's href;
    ``titles`` holds the node that follows each anchor tag. The titles are
    also printed.
    """
    listing_url = (
        'https://www.gumtree.pl/s-mieszkania-i-domy-sprzedam-i-kupie/warszawa/page-%d/v%dc9073l3200008p%d'
        % (page_num, page_num, page_num)
    )
    page = requests.get(listing_url)
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')

    # One lookup serves both result lists.
    anchors = soup.find_all('a', class_="href-link tile-title-text")
    links = ['https://www.gumtree.pl' + anchor.get('href') for anchor in anchors]
    titles = [anchor.next_element for anchor in anchors]

    print(titles)
    return links, titles
# Visit result pages 1-5 and fold each page's links and titles into the
# module-level accumulators.
for page_num in range(1, 6):
    links, titles = title_link(page_num)
    all_links.extend(links)
    all_titles.extend(titles)
print(all_links)

import pandas as pd

# One row per listing: its title and its absolute link.
columns = {'title': all_titles, 'link': all_links}
df = pd.DataFrame(data=columns)
df.head(100)
I think you just needed to get links and titles out of title_link(page_num).
Edit: removed the manual incrementing per comments
Edit: changed the all_links = all_links + links to all_links.extend(links)
Edit: website is utf-8 encoded, added page.encoding = 'utf-8' and as extra (probably unnecessary) measure, from_encoding='utf-8' to the BeautifulSoup

How can I scrape prices from the next pages?

I'm new to python and web scraping.
I wrote some code using requests and beautifulsoup. One piece scrapes prices, names and links; it works fine and is as follows:
from bs4 import BeautifulSoup
import requests
# Question code, reproduced as posted (indentation was lost in extraction).
# Downloads one listing page and prints price, name and link for every
# <figcaption> element on it.
urls = "https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html#/pagesize-24/order-new/stock-1/page-1"
source = requests.get(urls).text
soup = BeautifulSoup(source, 'lxml')
for figcaption in soup.find_all('figcaption'):
# Price is read from the text of the figcaption's first <div>.
price = figcaption.div.text
# Name and link both come from the same <a class="title"> element.
name = figcaption.find('a', class_='title').text
link = figcaption.find('a', class_='title')['href']
print(price)
print(name)
print(link)
and also one for making other urls that I need those information scraped from, which also gives the correct urls when I use print():
# Question code, reproduced as posted (indentation was lost in extraction).
# Generates the listing URL for page numbers 1..70 by appending `counter`.
x = 0
counter = 1
# NOTE(review): this `for` statement has no trailing colon as posted.
for x in range(0, 70)
urls = "https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html#/pagesize-24/order-new/stock-1/page-" + str(counter)
counter += 1
# The loop rebinds `x` on every iteration, so this increment has no lasting effect.
x += 1
print(urls)
But when I try to combine these two in order to scrape a page and then change url to new one and then scrape it, it just gives the scraped information on the first page 70 times. please guide me through this. the whole code is as follows:
from bs4 import BeautifulSoup
import requests
# Question code, reproduced as posted (indentation was lost in extraction).
# Combines the URL generator with the scraper: for each of 70 page numbers,
# request the URL and print price, name and link of every <figcaption>.
x = 0
counter = 1
for x in range(0, 70):
# NOTE(review): the page number is placed after `#` in the URL, i.e. in the
# fragment; presumably it never reaches the server, which would match the
# reported symptom of getting the first page 70 times - confirm.
urls = "https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html#/pagesize-24/order-new/stock-1/page-" + str(counter)
source = requests.get(urls).text
soup = BeautifulSoup(source, 'lxml')
counter += 1
# The loop rebinds `x` on every iteration, so this increment has no lasting effect.
x += 1
print(urls)
for figcaption in soup.find_all('figcaption'):
price = figcaption.div.text
name = figcaption.find('a', class_='title').text
link = figcaption.find('a', class_='title')['href']
print(price)
print()
print(name)
print()
print(link)

Setting x=0 and then incrementing it by 1 is redundant, since the loop already iterates through range(0, 70). I'm also not sure why you have a counter, as you don't need that either. I show how you would do it below.
HOWEVER, I believe that issue is not with the iteration or looping, but the url itself. If you manually go to the two pages as listed below, the content doesn’t change:
https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html#/pagesize-24/order-new/stock-1/page-1
and then
https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html#/pagesize-24/order-new/stock-1/page-2
Since the site is dynamic, you'll need to find a different way to iterate page to page, or figure out what the exact url is. So try:
from bs4 import BeautifulSoup
import requests
# Walk the ajax listing endpoint page by page (1..70) and print price, name
# and link for every product. The crawl stops at the first page that cannot
# be fetched, contains no products, or has unexpected product markup.
for x in range(0, 70):
    urls = 'https://www.meisamatr.com/fa/product/cat/2-%D8%A2%D8%B1%D8%A7%DB%8C%D8%B4%DB%8C.html&pagesize[]=24&order[]=new&stock[]=1&page[]=' +str(x+1) + '&ajax=ok?_=1561559181560'
    try:
        # Only network failures are handled here; a missing parser or a
        # Ctrl-C is no longer silently swallowed by a bare `except`.
        source = requests.get(urls, timeout=30).text
    except requests.RequestException as exc:
        print('Stopping at page %s: request failed (%s)' % (x + 1, exc))
        break
    soup = BeautifulSoup(source, 'lxml')
    print('Page: %s' %(x+1))
    figcaptions = soup.find_all('figcaption')
    if not figcaptions:
        # No products on this page: the listing is exhausted.
        break
    try:
        for figcaption in figcaptions:
            price = figcaption.find('span', {'class':'new_price'}).text.strip()
            title_link = figcaption.find('a', class_='title')
            name = title_link.text
            link = title_link['href']
            print('%s\n%s\n%s' %(price, name, link))
    except (AttributeError, TypeError, KeyError) as exc:
        # A product without the expected price/title markup ends the crawl,
        # but the reason is now reported.
        print('Stopping at page %s: unexpected product markup (%s)' % (x + 1, exc))
        break
You can find that link by going to the website and looking at the dev tools (Ctrl +Shift+I or right-click 'Inspect') -> network -> XHR
When I did that and then physically click to the next page, I can see how that data was rendered, and found the reference url.

Index Error while Web Scraping

this is my code -
# coding: utf-8
# ## Extracting just the links from the Security home page
# In[126]:
# Question code, reproduced as posted (indentation was lost in extraction).
# Starting from the security how-to page, collects every href beginning with
# '/news/' and follows the 'load-more' link until it is missing or the
# running count exceeds 200.
base_url = "https://www.cnet.com"
additional_url = "/topics/security/how-to/"
import re
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
# To keep a count of the number of articles to be scrapped
limit = 0;
next_page = base_url + additional_url
# List to store the links
list_of_links = []
# Change the limit as per requirements
while next_page and limit <= 200:
temp_list_of_links = []
# Load and extract the content of the page
page = requests.get(next_page)
#sleep(15)
soup = BeautifulSoup(page.content, 'html.parser')
# Find the 'news' links of the page
for link in soup.find_all('a', href=True):
if link['href'].startswith('/news/'):
temp_list_of_links.append(link['href'])
# Save the unique links
link_list = set(temp_list_of_links)
# Find the length of the list of unique links
length = len(link_list)
#print(length)
# Add the links to the final list
list_of_links.extend(link_list)
#sleep(120)
# Increment the limit
# `limit` counts per-page unique links, so a link seen on several pages is
# counted once per page.
limit = limit + length
# Find the links of the Show More page
next_page = soup.find('a', class_='load-more')
# Change the href to the Show More page link
if next_page :
next_page = base_url + next_page['href']
# In[127]:
# Final list with unique links
link_list = set(list_of_links)
# Remove the lone '/news'/ link
# NOTE(review): set.remove raises KeyError when '/news/' was never collected.
link_list.remove('/news/')
# Converting the set into a list
link_list = list(link_list)
# ## Extracting the data from each link
# In[128]:
# Question code, reproduced as posted (indentation was lost in extraction).
# For every collected link: download the article, record its <title> text and
# the joined text of its body paragraphs as a [title, content] pair, then
# write all pairs to cnet.csv.
all_articles = []
for item in link_list:
new_page = base_url + item
page = requests.get(new_page)
soup = BeautifulSoup(page.content, 'html.parser')
# Waits two minutes after every article request.
sleep(120)
article = []
article_title = soup.title.text
article.append(article_title)
#print(soup.prettify())
article_content = []
# NOTE(review): if no div with exactly this class string exists, find()
# presumably returns None and the chained findAll fails - confirm.
content = soup.find("div", {"class":"col-7 article-main-body row"}).findAll('p')
# Writing the content found in the list in its text form
# This inner loop reuses the name `item` from the outer loop.
for item in content:
article_content.append(item.text)
# Joining the list elements to form a proper paragraph
article_content = " ".join(article_content)
article.append(article_content)
all_articles.append(article)
# In[129]:
import pandas as pd
df = pd.DataFrame()
# This is the statement named in the traceback below; per the answer, it
# raises IndexError when `all_articles` is an empty list.
df = df.append(all_articles)
df.to_csv('cnet.csv',encoding='utf-8')
# In[1181]:
And this is my error -
Traceback (most recent call last):
File "Gopika_CNET.py", line 113, in <module>
df = df.append(all_articles)
File "/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py", line 4634, in append
elif isinstance(other, list) and not isinstance(other[0], DataFrame):
IndexError: list index out of range

Seems like a bug in pandas. If you try to append an empty list to a dataframe, you will get this error.
This will raise the same error. So it's not a bug in your code.
pandas.DataFrame().append([])
Make sure that your all_articles is not an empty list, and you will not get this error.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

web-scrape: get H4 attributes & href - python

Related

Trying to get data from a table using beautifulsoup in python

Python issue for crawling multiple page title

Use Beautiful Soup in scraping multiple websites

How can I scrape prices from the next pages?

Index Error while Web Scraping

Categories

Resources