I want to scrape the table data from http://5000best.com/websites/
The table content is paginated across several pages and is loaded dynamically.
I want to scrape the table data for each category. I could scrape the table manually for each category, but that is not what I want.
Please take a look and suggest an approach.
I am able to build links for each category, i.e. http://5000best.com/websites/Movies/, http://5000best.com/websites/Games/, etc.
But I am not sure how to go further and navigate through the paginated table for each category.
And after building all the links, I need to extract the table data from them.
Edit: I am using requests and BeautifulSoup4.
Simple Scrapy spider:
import scrapy


class Best500Spider(scrapy.Spider):
    name = "best5000"
    start_urls = ['http://5000best.com/websites/1']

    def parse(self, response):
        for row in response.xpath('//table[@id="ttable"]//tr'):
            record = {}
            record["Rank"] = row.xpath('./td[1]/text()').get()
            record["Score"] = row.xpath('./td[2]/text()').get()
            record["Category"] = row.xpath('string(./td[3])').get()
            record["URL"] = row.xpath('string(./td[5])').get()
            yield record

        next_page_url = response.xpath('//div[@id="dpages"]/span[@class="pagen0"]/following-sibling::span[1]/a/@href').get()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url),
                callback=self.parse,
            )
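To cover every category as asked, one possible extension of this spider (a sketch only, not verified against the site; the category slugs are the examples from the question, and the category pages are assumed to use the same table and pagination markup):

import scrapy


class Best5000CategorySpider(scrapy.Spider):
    name = "best5000_categories"
    # Example category slugs taken from the question; extend this list as needed.
    categories = ["Movies", "Games"]

    def start_requests(self):
        for category in self.categories:
            # Start at page 1 of each category; parse() follows the "next page" link from there.
            yield scrapy.Request(
                url=f"http://5000best.com/websites/{category}/1",
                callback=self.parse,
                cb_kwargs={"category": category},
            )

    def parse(self, response, category):
        for row in response.xpath('//table[@id="ttable"]//tr'):
            yield {
                "CategoryPage": category,
                "Rank": row.xpath('./td[1]/text()').get(),
                "Score": row.xpath('./td[2]/text()').get(),
                "URL": row.xpath('string(./td[5])').get(),
            }
        next_page_url = response.xpath(
            '//div[@id="dpages"]/span[@class="pagen0"]/following-sibling::span[1]/a/@href'
        ).get()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse, cb_kwargs={"category": category})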
I looked at the site; to move to another page, just add /pageNumber at the end of the link.
For example, http://5000best.com/websites/50 will get you page 50.
You can use this tool to get Python requests code for one page and then add a loop: https://curl.trillworks.com/
Just put in "curl http://5000best.com/websites/50" and adapt the generated code.
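For instance, a minimal sketch of that loop with requests and BeautifulSoup (the 1-50 page range and the 'ttable' id are assumptions taken from the answers above; adjust them to the real site):

import requests
from bs4 import BeautifulSoup

rows = []
for page in range(1, 51):  # assumed page range; adjust to the actual number of pages
    response = requests.get(f"http://5000best.com/websites/{page}")
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    table = soup.find("table", {"id": "ttable"})  # table id taken from the Scrapy spider above
    if table is None:  # stop once a page no longer contains the table
        break
    for tr in table.find_all("tr"):
        rows.append([td.get_text(strip=True) for td in tr.find_all("td")])

print(len(rows), "rows scraped")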
I came up with this approach to scrape the tables from each category.
# ------------Hemant Sah--------------------
# <- --------Importing Libraries-------- ->
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math
import itertools
import sqlalchemy
import re

final_list = []
dataframe = pd.DataFrame([])


def make_soup(url):
    try:
        html = requests.get(url)
        html.raise_for_status()  # without this, requests.get() never raises HTTPError
    except requests.exceptions.HTTPError as e:
        print(e)
    else:
        soup = BeautifulSoup(html.text, 'lxml')
        # print(html.status_code)
        return soup


def get_categories_from_soup(soup):
    total_list = []
    for item in soup.find_all('div', {"class": "sca2"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_a"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_b"}):
        total_list.append(item.text)
    for item in soup.find_all('div', {"class": "sca2_c"}):
        total_list.append(item.text)
    total_list.remove("All (5000)")
    total_list.remove("Porn (201)")
    return total_list


def make_url(total_list, url):
    path, page_num, test_page_num, modified_links, new_links = [], [], [], [], []
    for category in total_list:
        reg_exp_path = re.compile(r'^\w+')
        path.extend(reg_exp_path.findall(category))
        test_page_num.extend(re.findall('[0-9]+', category))
        # print(path)
    for c in test_page_num:
        temp = math.ceil(int(c) / 100)  # assuming 100 rows per table page
        page_num.append(temp)
    # print(page_num)
    for p in path:
        links = (url + p + "/")
        modified_links.append(links)
    # print(modified_links)
    for w, p in zip(modified_links, page_num):
        for n in range(1, p + 1):
            temp = w + str(n)
            new_links.append(temp)
    print(new_links)
    return new_links


def fetch_table_data(links):
    for l in links:
        soup = make_soup(l)
        my_table = soup.find('table', {'id': 'ttable'})
        rows = my_table.find_all('tr')
        for tr in rows:
            td = tr.find_all('td')
            row = [cell.text for cell in td]
            final_list.append(row)
    df = pd.DataFrame(final_list, columns=["Rank", "Score", "Category", "Audience", "URL", "Links", "blank", "Desc"])
    print(df)
    df = df.drop("blank", axis=1)
    # print(df)
    return df
    # df.to_csv('final_data.csv')


def main():
    url = "http://5000best.com/websites/"
    soup = make_soup(url)
    total_list = get_categories_from_soup(soup)
    links = make_url(total_list, url)
    dataframe = fetch_table_data(links)


if __name__ == "__main__":
    main()
I am trying to web-scrape a website, but I can't get access to the attributes of some fields.
Here is the code I used:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd

scrap_list = pd.DataFrame()

for path in range(10):  # scroll over the categories
    for path in range(10):  # scroll over the pages
        url = 'https://www.samehgroup.com/index.php?route=product/category' + str(page) + '&' + 'path=' + str(path)
        req = urllib3.PoolManager()
        res = req.request('GET', URL)
        soup = BeautifulSoup(res.data, 'html.parser')
        soup.findAll('h4', {'class': 'caption'})
        # extract names
        scrap_name = [i.text.strip() for i in soup.findAll('h2', {'class': 'caption'})]
        scrap_list['product_name'] = pd.DataFrame(scrap_name, columns=['Item_name'])
        # extract prices
        scrap_list['product_price'] = [i.text.strip() for i in soup.findAll('div', {'class': 'price'})]
        product_price = pd.DataFrame(scrap_price, columns=['Item_price'])
I want an output that provides me with each product and its price. I still can't get that right.
Any help would be very much appreciated.
I think the problem here was looping through the website's pages. I got the code below working by first making a list of urls containing numbered 'paths' corresponding to pages on the website, then looping through this list and applying a page number to each url.
If you only want the products from a certain page, that page can be selected from urlist by index; see the usage sketch after the code below.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

urlist = []  # create list of usable url's to iterate through
for i in range(1, 10):  # 9 pages equal to pages on website
    urlist.append('https://www.samehgroup.com/index.php?route=product/category&path=' + str(i))

namelist = []
newprice = []

for urlunf in urlist:  # first loop to get 'path'
    for n in range(100):  # second loop to get 'pages'. set at 100 to cover website max page at 93
        try:  # try catches when pages containing products run out.
            url = urlunf + '&page=' + str(n)
            page = requests.get(url).text
            soup = BeautifulSoup(page, 'html')
            products = soup.find_all('div', class_='caption')
            for prod in products:  # loops over returned list of products for names and prices
                name = prod.find('h4').text
                newp = prod.find('p', class_='price').find('span', class_='price-new').text
                namelist.append(name)  # append data to list outside of loop
                newprice.append(newp)
            time.sleep(2)
        except AttributeError:  # if there are no more products it will move to next page
            pass

df = pd.DataFrame()  # create df and add scraped data
df['name'] = namelist
df['price'] = newprice
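As a usage sketch of the page-by-index idea mentioned above (the chosen path and page number are arbitrary examples):

# Scrape only one category/page combination by picking an entry from urlist by index.
single_path_url = urlist[2]                    # third category path, chosen arbitrarily
single_page_url = single_path_url + '&page=1'  # first page of that category
page = requests.get(single_page_url).text
soup = BeautifulSoup(page, 'html')
products = soup.find_all('div', class_='caption')
print(len(products), 'products found on', single_page_url)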
I'm trying to first crawl through the main page of this website for the links to a table for each year. Then I'd like to scrape each of those pages while keeping a record of each year.
So far I have my spider constructed as:
div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()

splits = {}
for href in hrefs:
    split = href.split('"')
    link = split[1]
    date = split[2]
    clean_date = "".join(re.findall("[^><a/]", date))
    clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
    splits[clean_date] = clean_link
I would then like to go through each of these links and crawl them, using the following logic:
table = resp.xpath('//*[@id="content"]/table/tbody')
rows = table.xpath('//tr')

data_dict = {
    "Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]
}

for row in rows[1:]:
    data = row.xpath('td')
    title = w3lib.html.remove_tags(data[0].get())
    nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
    data_dict[title] = nums
My problem is that I couldn't find a way to do this effectively. Calling scrapy.Request on the url returns a response with just the content <html></html>. If there were a way to make the response object resemble the one given by the fetch command in the Scrapy shell, that would be ideal, since I've based the selection logic on testing with that command.
Edit:
Here's the entire spider so far
The idea is to run the first for loop to get the links and then the second for loop to extract the tables from those links.
import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
        hrefs = div.xpath('*//a').extract()

        splits = {}
        for href in hrefs:
            split = href.split('"')
            link = split[1]
            date = split[2]
            clean_date = "".join(re.findall("[^><a/]", date))
            clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
            splits[clean_date] = clean_link

        for date, url in splits.items():
            resp = HtmlResponse(url)
            table = resp.xpath('//*[@id="content"]/table/tbody')
            rows = table.xpath('//tr')
            data_dict = {"Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]}
            for row in rows[1:]:
                data = row.xpath('td')
                title = w3lib.html.remove_tags(data[0].get())
                nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
                data_dict[title] = nums

            yield {
                'Date': date,
                'Scores': data_dict,
            }
Initializing an HtmlResponse(url) doesn't accomplish anything, since the class doesn't make the request itself.
To add a request to scrapy's scheduler, you need to yield one, e.g. yield scrapy.Request(url, callback=self.parse).
That being said, there are many improvements you can make to your spider:
Use scrapy's builtin LinkExtractor instead of string splitting
Use css selectors instead of the hardcoded xpaths
Use selector.root.text instead of w3lib.html.remove_tags (to remove the dependency entirely)
Here is a working example:
import scrapy
from scrapy.linkextractors import LinkExtractor


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        le = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths='//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_table,
                cb_kwargs={'date': link.text},
            )

    def parse_table(self, response, date):
        rows = response.css('#content table tbody tr')
        if not rows:
            print(f'No table found for url: {response.url}')
            return

        category = [char.root.text for char in rows[0].css('td strong')[1:]]
        if not category:
            category = [char.root.text for char in rows[0].css('td')[1:]]

        for row in rows[1:]:
            cols = row.css('td')
            title = cols[0].root.text
            nums = [col.root.text for col in cols[1:]]

            yield {
                'Date': date,
                'Category': category,
                title: nums,
            }
Note that your category parsing doesn't appear to work. I'm not exactly sure what you are trying to extract, so I'll leave that one for you.
Please help me with the code below. I want to print the output in table format, with 35 rows and 6 columns in total.
from bs4 import BeautifulSoup
import requests
# import urllib.request
from tabulate import tabulate
from selenium import webdriver  # for webdriver
from selenium.webdriver.chrome.options import Options  # for suppressing the browser


class States():
    def __init__(self):
        url = "https://www.mohfw.gov.in/"
        # self.res = requests.get(url)
        # self.soup = BeautifulSoup(self.res.text, 'lxml')
        self.op = webdriver.ChromeOptions()
        self.op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path=r"C:\web drivers\drivers\chromedriver_win32\chromedriver.exe", options=self.op)
        self.driver.get(url)
        self.driver.find_element_by_class_name("open-table").click()

    def get_data(self):
        print("S.No" "Name of State / UT" "Active Cases*" "Cured/Discharged/Migrated*" "Deaths**" "Total Confirmed cases*")
        self.base_table = self.driver.find_element_by_tag_name("table")
        table_row = 35
        table_cols = 6
        for i in range(1, table_row + 1):
            for j in range(1, table_cols + 1):
                print(self.base_table.find_element_by_xpath("//*[@id='state-data']/div/div/div/div/table/tbody/tr[" + str(i) + "]/td[" + str(j) + "]").text)


state = States()
state.get_data()
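A small sketch of how the same cells could instead be collected row by row and printed with tabulate (already imported above), for example as an alternative method on the States class; the xpath and the 35x6 size are taken from the question:

    def get_data_as_table(self):
        headers = ["S.No", "Name of State / UT", "Active Cases*",
                   "Cured/Discharged/Migrated*", "Deaths**", "Total Confirmed cases*"]
        rows = []
        for i in range(1, 36):      # 35 data rows, as stated in the question
            row = []
            for j in range(1, 7):   # 6 columns
                cell = self.driver.find_element_by_xpath(
                    "//*[@id='state-data']/div/div/div/div/table/tbody/tr[" + str(i) + "]/td[" + str(j) + "]")
                row.append(cell.text)
            rows.append(row)
        print(tabulate(rows, headers=headers, tablefmt="grid"))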
Could you please provide the URL for better understanding? If you are specifically looking to scrape table data from the web, the best approach is to use BeautifulSoup. Identify the class name and you can simply loop through the rows and individual cells. Have a look at the following snippet:
from bs4 import BeautifulSoup
import requests
import re  # needed for re.sub below

url = "https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=JYVEVKT1J5S5HQZEVYN1&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=boxoffice&ref_=chtbo_india_tr_rhs_1"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

movie_rating = []
movie_name = []

# identifying the table using the class name
imdb_table = soup.find('table', class_='chart full-width')
for imdb in imdb_table.find_all('tbody'):
    # find all rows together
    rows = imdb.find_all('tr')
    # simply loop through each individual row
    for row in rows:
        name = row.find('td', class_='titleColumn').text
        movie_name.append(re.sub('[^A-Za-z]+', ' ', name))
        rating = row.find('td', class_='ratingColumn imdbRating').text
        movie_rating.append(float(rating))
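As a quick usage check of the two lists built above (purely illustrative):

# Print the scraped names and ratings side by side.
for name, rating in zip(movie_name, movie_rating):
    print(f"{rating:.1f}  {name.strip()}")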
I'm trying to write code to scrape some data from pages about hotels. The final information (hotel name and address) should be exported to CSV. The code works, but only on one page...
import requests
import pandas as pd
from bs4 import BeautifulSoup # HTML data structure
page_url = requests.get('https://e-turysta.pl/noclegi-krakow/')
soup = BeautifulSoup(page_url.content, 'html.parser')
list = soup.find(id='nav-lista-obiektow')
items = list.find_all(class_='et-list__details flex-grow-1 d-flex d-md-block flex-column')
nazwa_noclegu = [item.find(class_='h3 et-list__details__name').get_text() for item in items]
adres_noclegu = [item.find(class_='et-list__city').get_text() for item in items]
dane = pd.DataFrame(
    {
        'nazwa': nazwa_noclegu,
        'adres': adres_noclegu
    }
)
print(dane)
dane.to_csv('noclegi.csv')
I tried a loop but it doesn't work:
for i in range(22):
    url = requests.get('https://e-turysta.pl/noclegi-krakow/'.format(i+1)).text
    soup = BeautifulSoup(url, 'html.parser')
Any ideas?
The URLs are different than the ones you use - you forgot ?page=.
And you have to use {} to insert the value into the string:
url = 'https://e-turysta.pl/noclegi-krakow/?page={}'.format(i+1)
or concatenate it
url = 'https://e-turysta.pl/noclegi-krakow/?page=' + str(i+1)
or use f-string
url = f'https://e-turysta.pl/noclegi-krakow/?page={i+1}'
EDIT: working code
import requests
from bs4 import BeautifulSoup  # HTML data structure
import pandas as pd


def get_page_data(number):
    print('number:', number)

    url = 'https://e-turysta.pl/noclegi-krakow/?page={}'.format(number)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    container = soup.find(id='nav-lista-obiektow')
    items = container.find_all(class_='et-list__details flex-grow-1 d-flex d-md-block flex-column')

    # better group them - so you could add default value if there is no nazwa or adres
    dane = []
    for item in items:
        nazwa = item.find(class_='h3 et-list__details__name').get_text(strip=True)
        adres = item.find(class_='et-list__city').get_text(strip=True)
        dane.append([nazwa, adres])

    return dane


# --- main ---

wszystkie_dane = []
for number in range(1, 23):
    dane_na_stronie = get_page_data(number)
    wszystkie_dane.extend(dane_na_stronie)

dane = pd.DataFrame(wszystkie_dane, columns=['nazwa', 'adres'])
dane.to_csv('noclegi.csv', index=False)
In your loop you use the .format() function, but you need to insert the brackets into the string you are formatting:
for i in range(22):
    url = requests.get('https://e-turysta.pl/noclegi-krakow/{}'.format(i+1)).text
    soup = BeautifulSoup(url, 'html.parser')
I am attempting a simple scrape of an HTML table using BeautifulSoup with the following:
import urllib
import urllib.request
from bs4 import BeautifulSoup


def make_soup(url):
    page = urllib.request.urlopen(url)
    sdata = BeautifulSoup(page, 'html.parser')
    return sdata


url = 'http://www.satp.org/satporgtp/countries/pakistan/database/bombblast.htm'
soup = make_soup(url)

table = soup.findAll('table', attrs={'class': 'pagraph1'})
table = table[0]

trows = table.findAll('tr')
bbdata_ = []
bbdata = []

for trow in trows:
    bbdata_ = trow.findAll('td')
    bbdata = [ele.text.strip() for ele in bbdata_]

print(bbdata)
However, I can only extract the last row in the table, i.e.
['Total*', '369', '1032+']
All of the data is included in the trows, so I must be forming my loop incorrectly, but I am not sure how.
Your problem is here:
bbdata = [ele.text.strip() for ele in bbdata_]
You want to append to the list or extend it:
bbdata.append([ele.text.strip() for ele in bbdata_])
You are overwriting bbdata each time through the loop, which is why it ends up with only the final value.
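Putting that together, a small sketch of the corrected loop (same soup and trows as in the question), keeping every row instead of overwriting:

bbdata = []
for trow in trows:
    cells = trow.findAll('td')
    bbdata.append([ele.text.strip() for ele in cells])

for row in bbdata:  # every table row is now kept, not just the last one
    print(row)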