I need some help crawling the data from the URL below:
https://onland.kbstar.com/quics?page=C060250&keyword=%EB%8F%99%EC%9E%91%EA%B5%AC
I would like to crawl the second page, but the URL stays the same when I click '2' and I don't know how to handle that. Please help!
Here's my Python code to crawl the first page:
from selenium import webdriver
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import urllib.request as req
import urllib
import re
from datetime import datetime
import requests

dataframe = pd.DataFrame()

def KB_liveON(area_name):
    query = area_name
    area = urllib.parse.quote(query)
    url = 'https://onland.kbstar.com' \
          + '/quics?page=C060250' \
          + '&keyword=' + str(area)
          # + '#CP'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')

    table = soup.find('table')
    trs = table.tbody.find_all('tr')

    dataframe = pd.DataFrame()
    value_list = []
    for tr in trs[::1]:
        tds = tr.find_all('td')
        #cols = [' '.join(td.text.strip().split()) for td in tds]
        cols = [td.text.strip().split() for td in tds]
        progress = cols[0]
        location = cols[1]
        complex_name = cols[2]
        area = cols[3]
        sale_price = cols[4]
        sale_price2 = cols[5]
        time = cols[6]
        type_of_sale = cols[7]
        construction_company = cols[8]
        value_list.append([progress, location, complex_name, area, sale_price, sale_price2, time, type_of_sale, construction_company])

    cols = ['progress', 'location', 'complex_name', 'area', 'sale_price', 'sale_price2', 'time', 'type_of_sale', 'construction_company']
    df = pd.DataFrame(value_list, columns=cols)
    return df

kb = KB_liveON('동작구')
dataframe = dataframe.append(kb)
dataframe
First, I install Selenium and a WebDriver on Google Colab. Then I write a crawler that collects the data from multiple pages.
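For reference, here is a minimal sketch of the Colab setup step I am assuming (standard Colab/Ubuntu package names for Chromium and chromedriver; adjust if your runtime differs):

# Run once in a Colab cell before importing selenium (assumed Colab/Ubuntu environment)
!pip install selenium
!apt-get update
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin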
Python code:
import time
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def extract_data(value_list, html_tags):
    soup = BeautifulSoup(html_tags, 'lxml')
    table = soup.find('table')
    trs = table.tbody.find_all('tr')
    for tr in trs[::1]:
        tds = tr.find_all('td')
        cols = [td.text.strip().split() for td in tds]
        progress = cols[0]
        location = cols[1]
        complex_name = cols[2]
        area = cols[3]
        sale_price = cols[4]
        sale_price2 = cols[5]
        time = cols[6]
        type_of_sale = cols[7]
        construction_company = cols[8]
        value_list.append([progress, location, complex_name, area, sale_price, sale_price2, time, type_of_sale, construction_company])
    return value_list

def KB_liveON(area):
    url = 'https://onland.kbstar.com' \
          + '/quics?page=C060250' \
          + '&keyword=' + str(area)
    wd.get(url)

    data_list = []

    # Extract data from the first page
    tbl = wd.find_elements_by_class_name("tbl_list")[0]
    html_tags = tbl.get_attribute('outerHTML')
    data_list = extract_data(data_list, html_tags)

    # Find the paging forms and extract data from the remaining pages
    forms = wd.find_elements_by_xpath("//div[@class='paging']//form")
    for f in forms[1:]:
        f.submit()
        time.sleep(10)
        tbl = wd.find_elements_by_class_name("tbl_list")[0]
        html_tags = tbl.get_attribute('outerHTML')
        data_list = extract_data(data_list, html_tags)
        time.sleep(10)

    cols = ['progress', 'location', 'complex_name', 'area', 'sale_price', 'sale_price2', 'time', 'type_of_sale', 'construction_company']
    df = pd.DataFrame(data_list, columns=cols)
    return df

if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    wd = webdriver.Chrome('chromedriver', options=options)

    df = KB_liveON('동작구')
    print(df)
Output (The crawl results):
I'm a newbie in Python, trying to pull off an assignment for a uni course, and I'm attempting multi-page scraping with Python and pandas. I want to extract all the data from the table on each page of this site: https://aaiasb.gr/publications/investigation-reports
After I managed to build all the URLs I tried this, but I only get the data from the first page:
#imports
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

response = requests.get('https://aaiasb.gr/publications/investigation-reports')
response

soup = BeautifulSoup(response.text, 'html.parser')
soup

base_url = 'https://aaiasb.gr/publications/investigation-reports'
ending = '?start='
numbers = [50, 100, 150]
urls = [base_url]
for n in numbers:
    url = base_url + ending + str(n)
    urls.append(url)

df = pd.DataFrame(urls)
df = df.rename(columns={df.columns[0]: 'url'})
df

for url in urls:
    response = requests.get(url)
    time.sleep(3)
    soup_doc = BeautifulSoup(response.text, 'html.parser')
    entries = []
    page = soup.select('div.cck_page_items')[0]
    rows = page.find('table').find_all('tr')[1:]
    for tr in rows:
        conclusion_date1 = tr.find_all('td')[0].find_all('div')[1].text.strip()
        conclusion_date2 = tr.find_all('td')[0].find_all('div')[2].text.strip()
        incident_info = tr.find_all('td')[1].find_all('div')[0].text.strip()
        incident_type = tr.find_all('td')[1].find_all('div')[1].text.strip()
        incident_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon': 'info'})[0])
        fatalities = tr.find_all('td')[1].find_all('div')[2].text.strip()
        fatalities_description = str(tr.find_all('td')[1].find_all('span', attrs={'uk-icon': 'info'})[1])
        area = tr.find_all('td')[2].find_all('div')[0].text.strip()
        registry = tr.find_all('td')[2].find_all('div')[1].text.strip()
        aircraft_type = tr.find_all('td')[2].find_all('div')[-2].text.strip()
        aircraft_info = tr.find_all('td')[2].find_all('div')[-1].text.strip()
        area_info = tr.find_all('td')[2].text.strip()
        dict = {'conclusion_date1': conclusion_date1,
                'conclusion_date2': conclusion_date2,
                'incident_info': incident_info,
                'incident_type': incident_type,
                'incident_description': incident_description,
                'fatalities': fatalities,
                'fatalities_description': fatalities_description,
                'area': area,
                'registry': registry,
                'aircraft_type': aircraft_type,
                'aircraft_info': aircraft_info,
                'area_info': area_info}
        entries.append(dict)

df1 = pd.DataFrame(entries)
The main issue has to do with your indentation and location of assignments, e.g. entries = [] and df = pd.DataFrame(entries) need to be in the right places. Try the below.
# imports
from bs4 import BeautifulSoup
import requests
import unicodedata
import pandas as pd

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

base_url = "https://aaiasb.gr/publications/investigation-reports"
suffix = "?start="
start_indices = [0, 50, 100, 150]
urls = [base_url + suffix + str(index) for index in start_indices]

entries = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.select("div.cck_page_items").pop().find("table")
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        conclusion_date1 = cols[0].find_all("div")[1].text.strip()
        try:
            conclusion_date2 = cols[0].find_all("div")[2].text.strip()
        except IndexError:
            conclusion_date2 = "N/A"
        incident_info = cols[1].find_all("div")[0].text.strip()
        incident_type = cols[1].find_all("div")[1].text.strip()
        fatalities = cols[1].find_all("div")[2].text.strip()
        info_hovers = cols[1].find_all("span", attrs={"uk-icon": "info"})
        incident_description = ' '.join(unicodedata.normalize("NFC", info_hovers[0]['uk-tooltip']).split())
        fatalities_description = ' '.join(unicodedata.normalize("NFC", info_hovers[1]['uk-tooltip']).split()).replace("<br>", "\n")
        area = cols[2].find_all("div")[0].text.strip()
        area_info = '\n'.join(list(cols[2].strings)[-3:]).strip()
        registry = cols[2].find_all("div")[1].text.strip()
        aircraft_type = cols[2].find_all("div")[-2].text.strip()
        aircraft_info = cols[2].find_all("div")[-1].text.strip()
        entry = {
            "conclusion_date1": conclusion_date1,
            "conclusion_date2": conclusion_date2,
            "incident_info": incident_info,
            "incident_type": incident_type,
            "incident_description": incident_description,
            "fatalities": fatalities,
            "fatalities_description": fatalities_description,
            "area": area,
            "registry": registry,
            "aircraft_type": aircraft_type,
            "aircraft_info": aircraft_info,
            "area_info": area_info,
        }
        entries.append(entry)

df = pd.DataFrame(entries)
print(df.head())
print(df.tail())
I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should, but I am having trouble creating a for loop to iterate through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

course_urls = []
for h in soup.findAll('h3'):
    a = h.find('a')
    try:
        if 'href' in a.attrs:
            card_url = urljoin(base_url, a.get('href'))
            course_urls.append(card_url)
    except:
        pass

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

date = []
course = []
time = []
runner = []
tips = []
tipsters = []

runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
    tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
    tipsters.append(tipster_names)

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})

newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])

for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")
        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
            tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
            row = [runner_name, tips_no, tipster_names]
            df.loc[len(df)] = row  # append the new row

df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,
I tried to loop through the lists of lists, scrape all the links, and append them to a DataFrame as one table, but in vain.
Help will be appreciated.
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []
company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url

data_df = pd.DataFrame()
for sub_tab in company_A_subpg1:
    for tab in sub_tab:
        sub_table_1 = tab.find_all('a', href=True)
        company_name = [name.text.strip() for name in sub_table_1]
        company_link = [name.get('href') for name in sub_table_1]
        company_link_edit = [convert(name) for name in company_link]
        df = pd.DataFrame(
            {'Name': company_name,
             'Link': company_link_edit
             })
        data_df = pd.concat([data_df, df], sort=False)

data_df.to_csv('results_3.csv')
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://money.rediff.com/companies/groups/A')
soup = BeautifulSoup(page.content, 'html.parser')

company_name = []
company_link = []
company_link_edit = []
company_A_subpg1 = soup.find_all(class_='dataTable')

def convert(url):
    if not url.startswith('http://'):
        return 'http:' + url
    return url

for sub_tab in company_A_subpg1:
    temp = sub_tab.find('tbody')
    all_rows = temp.find_all('tr')
    for val in all_rows:
        a_tag = val.find('a', href=True)
        company_name.append(a_tag.text.strip())
        company_link_edit.append(convert(a_tag.get('href')))

print(len(company_name), len(company_link_edit))

data_df = pd.DataFrame()
df = pd.DataFrame(
    {'Name': company_name,
     'Link': company_link_edit
     })
data_df = pd.concat([data_df, df], sort=False)
print(df.shape)
data_df.to_csv('results_3.csv')
You can check the values inside the CSV file; I fetched all 200 names and links mentioned on the page.
Please help me with the code below. I want to print the data in table format, with 35 rows and 6 columns in total.
from bs4 import BeautifulSoup
import requests
#import urllib.request
from tabulate import tabulate
from selenium import webdriver  # for webdriver
from selenium.webdriver.chrome.options import Options  # for suppressing the browser

class States():
    def __init__(self):
        url = "https://www.mohfw.gov.in/"
        # self.res = requests.get(url)
        # self.soup = BeautifulSoup(self.res.text, 'lxml')
        self.op = webdriver.ChromeOptions()
        self.op.add_argument('headless')
        self.driver = webdriver.Chrome(executable_path="C:\web drivers\drivers\chromedriver_win32\chromedriver.exe", options=self.op)
        self.driver.get(url)
        self.driver.find_element_by_class_name("open-table").click()

    def get_data(self):
        print("S.No" "Name of State / UT" "Active Cases*" "Cured/Discharged/Migrated*" "Deaths**" "Total Confirmed cases*")
        self.base_table = self.driver.find_element_by_tag_name("table")
        table_row = 35
        table_cols = 6
        for i in range(1, table_row + 1):
            for j in range(1, table_cols + 1):
                print(self.base_table.find_element_by_xpath("//*[@id='state-data']/div/div/div/div/table/tbody/tr[" + str(i) + "]/td[" + str(j) + "]").text)

state = States()
state.get_data()
Could you please provide the URL for better understanding? If you are specifically looking to scrape table data from the web, the best approach is to use BeautifulSoup: identify the table's class name, then simply loop through the rows and individual cells. Have a look at the following snippet.
from bs4 import BeautifulSoup
import requests
import re  # needed for re.sub below

url = "https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=JYVEVKT1J5S5HQZEVYN1&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=boxoffice&ref_=chtbo_india_tr_rhs_1"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

movie_rating = []
movie_name = []

# identify the table using its class name
imdb_table = soup.find('table', class_='chart full-width')
for imdb in imdb_table.find_all('tbody'):
    # find all rows together
    rows = imdb.find_all('tr')
    # simply loop through the individual rows
    for row in rows:
        name = row.find('td', class_='titleColumn').text
        movie_name.append(re.sub('[^A-Za-z]+', ' ', name))
        rating = row.find('td', class_='ratingColumn imdbRating').text
        movie_rating.append(float(rating))
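From there, to get the kind of tabular output the question asks about, you could, for example, assemble the two lists into a DataFrame. A minimal follow-up sketch, assuming movie_name and movie_rating were filled by the snippet above:

import pandas as pd

# build a table from the scraped lists (names and variable choices here are just illustrative)
movies_df = pd.DataFrame({'name': movie_name, 'rating': movie_rating})
print(movies_df.head(10))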
Good afternoon all, I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. I have the code below, which gets the info I need from the first link and creates the df I need to present it. But there are more than 600 more links on the website and I'm not sure how to go about them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)

# Get the title
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title

data = {'Name': ['Title', title]}
df_title = pd.DataFrame(data)

irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
irr

data = {'value': ['theoretical IRR', irr]}
df_irr = pd.DataFrame(data)

table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title, df_irr, df_table], axis=1, ignore_index=True)
df_final.head()
You can use this to collect all the auction links across all the overview pages first.
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup

raw_url = "https://auctions.royaltyexchange.com/"

def get_link(page_num):
    global raw_url
    link_ls = []
    for page in range(1, page_num + 1):
        url = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=re.compile("^(/auctions/)")):
            print(link.attrs['href'])
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

link_list = get_link(55)  # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
Then, on each auction page, extract the fields you want (e.g. title, name, etc.) and build them into a DataFrame.
A slight refactor of @yganalyst's answer and your code:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_link(page_num, raw_url):
    link_ls = []
    for page in range(1, page_num + 1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        pobj = re.compile("^(/auctions/)")
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=pobj):
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

def extract_auction(url2):
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')

    title = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['Title'] = title

    irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    data['theoretical IRR'] = irr

    table = soup.find('table', class_='es-overview-table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td if tr.text.strip()]
        if row:
            key = row[0].replace(':', '')
            data[key] = row[1]
    return data

base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)

data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))

df_final = pd.DataFrame(data)
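If you want to inspect or keep the combined result, a small follow-up (the output file name here is just an example):

print(df_final.head())
df_final.to_csv('auctions.csv', index=False)  # example output file name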