I'm trying to get a list of company names (e.g. 01 Ventures) and member types (e.g. GENERAL PARTNER) from this website https://www.bvca.co.uk/Member-Directory. I'm using the code below:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.bvca.co.uk/Member-Directory'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
table = soup.find('table', attrs={'id':'searchresults'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')
print(rows)
And I got an empty list.
Use the selenium package; you will also need to install ChromeDriver.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
URL = 'https://www.bvca.co.uk/Member-Directory'
BrowserOptions = Options()
BrowserOptions.add_argument("--headless")
Browser = webdriver.Chrome(executable_path=r'chromedriver.exe', options=BrowserOptions)
Browser.get(URL)
while True:
    if Browser.find_elements_by_class_name('companyName'):
        break
html_source_code = Browser.execute_script("return document.body.innerHTML;")
soup = BeautifulSoup(html_source_code, 'html.parser')
x = [r.text for r in soup.find_all('h5',class_='companyName')]
print(x)
>>> ['01 Ventures', '01 Ventures', '17Capital LLP', '17Capital LLP', '1818 Venture Capital', ..., 'Zouk Capital LLP', 'Zouk Capital LLP']
The while loop waits until the company names have loaded before the HTML is saved.
The output was too large to put into the answer, so I could only show some of it.
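If you would rather not poll in a tight loop, Selenium's explicit waits do the same job. A minimal sketch using the same old-style Selenium API as above (the 20-second timeout is an arbitrary choice):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Block for up to 20 seconds until at least one company name has been rendered,
# then grab the page HTML exactly as before.
WebDriverWait(Browser, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'companyName'))
)
html_source_code = Browser.execute_script("return document.body.innerHTML;")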
I'm trying to scrape reviews from TrustPilot, but the code always returns a blank sheet containing only the headers/categories I specified. Could someone help me with this?
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
import pandas as pd
driver = webdriver.Chrome()
names=[] #List to store name of the product
headers=[] #List to store price of the product
bodies=[]
ratings=[] #List to store rating of the product
dates=[]
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))
for a in soup.findAll('a', href=True, attrs={'class':'reviews-container'}):
    name = a.find('div', attrs={'class':'consumer-information_name'})
    header = a.find('div', attrs={'class':'review-content_title'})
    body = a.find('div', attrs={'class':'review-content_text'})
    rating = a.find('div', attrs={'class':'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class':'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)
print ('webpage, no errors')
df = pd.DataFrame({'User Name':names,'Header':headers,'Body':bodies,'Rating':ratings,'Date':dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print ('csv made')
The issue is that soup.findAll('a', href=True, attrs={'class':'reviews-container'}) finds no results, so the loop body never runs. Make sure you are using the correct tags and class names. You also don't need the loop, because BeautifulSoup's find_all can collect each field for you directly. I used the requests module to fetch the page, though that shouldn't make a difference here.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
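If you still want the CSV from the original question, here is a minimal sketch for combining those lists, assuming all five have the same length and line up review by review (the Rating column assumes the star value lives in an img alt attribute; adjust if the markup differs):
import pandas as pd
df = pd.DataFrame({
    'User Name': [n.text.strip() for n in names],
    'Header': [h.text.strip() for h in headers],
    'Body': [b.text.strip() for b in bodies],
    # assumption: the star rating is carried by <img alt="..."> inside the div
    'Rating': [r.img['alt'] if r.img else r.get_text(strip=True) for r in ratings],
    'Date': [d.text.strip() for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')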
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://dciindia.gov.in/DentistsSearch.aspx?Reg_Type=D&RegUnder=0&IDRId=&IDRName=&CourseId=0&RegDate=0&CouncilId='
html = requests.get(url).text
soup = BeautifulSoup(html,'html.parser')
table = soup.find('table',{'id':'gvSearchDentistlist'})
try:
    rows = table.find_all('tr')
    for row in rows:
        if len(row.find_all('td')) == 6:
            data = row.find_all('td')
            name = data[1].text.strip()
            print("NAME:" + name)
            root_url = data[5].input['onclick'].split(",")[4]
            link = 'http://dciindia.gov.in/' + root_url
            print("LINK:" + link)
except:
    pass
I wrote this code, but it only gives output for the first page. I want to run it for all the pages on the site. What should I do? Please help.
The problem is the page's use of the JavaScript __doPostBack mechanism for paging. Since no one has pointed to Selenium as an alternative, here is an example of clicking through the pages of your webpage using Selenium:
from bs4 import BeautifulSoup
from selenium import webdriver
def your_func(html):
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 'gvSearchDentistlist'})
    try:
        rows = table.find_all('tr')
        for row in rows:
            if len(row.find_all('td')) == 6:
                data = row.find_all('td')
                name = data[1].text.strip()
                print("NAME:" + name)
                root_url = data[5].input['onclick'].split(",")[4]
                link = 'http://dciindia.gov.in/' + root_url
                print("LINK:" + link)
    except:
        pass
url = 'http://dciindia.gov.in/DentistsSearch.aspx?Reg_Type=D&RegUnder=0&IDRId=&IDRName=&CourseId=0&RegDate=0&CouncilId='
driver = webdriver.Chrome(executable_path=r'path\chromedriver.exe')
driver.maximize_window()
# first page
driver.get(url)
html = driver.page_source
your_func(html)
# page 2
nextPage = driver.find_element_by_xpath('/html/body/form/div[3]/div/table/tbody/tr[5]/td/fieldset/div/table/tbody/tr[52]/td/table/tbody/tr/td[2]/a')
nextPage.click()
html = driver.page_source
your_func(html)
# page 3
nextPage = driver.find_element_by_xpath('//*[@id="gvSearchDentistlist"]/tbody/tr[52]/td/table/tbody/tr/td[3]/a')
nextPage.click()
html = driver.page_source
your_func(html)
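Rather than repeating that block once per page, you can click through the pager in a loop. A rough sketch that continues the script above (the page-number link text and the fixed sleep are assumptions, so check them against the live pager):
import time
page = 2
while True:
    try:
        # the pager renders each page number as a link whose visible text is the number
        next_link = driver.find_element_by_link_text(str(page))
    except Exception:
        break  # no such link any more, so we have reached the last page
    next_link.click()
    time.sleep(2)  # crude wait for the __doPostBack refresh; an explicit wait would be cleaner
    your_func(driver.page_source)
    page += 1
driver.quit()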
I'm trying to scrape this page, which contains 10 elements with class='name main-name'.
But when I run this code:
import requests
from bs4 import BeautifulSoup
result = requests.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
c = result.text
soup = BeautifulSoup(c, "html.parser")
comment_items = soup.find_all('div', class_="name main-name")
print(len(comment_items))
it returns 0, not 10. I have searched and tried many solutions on Stack Overflow but can't fix it.
That's because the div with class name main-name doesn't appear in the initial DOM; it is added later by JavaScript. In this case Selenium is more useful than BeautifulSoup:
from selenium import webdriver
driver_path = r'Your Chrome driver path'
browser = webdriver.Chrome(executable_path=driver_path)
browser.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
get_element = browser.find_elements_by_css_selector("div[class='name main-name']")
print(len(get_element))
browser.close()
OUTPUT :
10
And you can also get the names like this:
for users in get_element:
    print(users.text)
OUTPUT :
Phạm Thị Kim Chi
My Linh Nguyen
Mr Vinh Bảo Hiểm Sức Khoẻ Sắc Đẹp
Ngô Thị Tuyết
Huỳnh Thị Bích Trâm
Linh Trúc Diêm
Nguyen Tu
Nguyen Thom
Hồ Thu Trang
Trầnthịtrắng
As I stated in the comments, it's generated dynamically. So here's an implementation with Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
url = "https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong"
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
driver.get(url)
c = driver.page_source
soup = BeautifulSoup(c, "html.parser")
comment_items = soup.find_all('div', {'class':"name main-name"})
print (len(comment_items))
driver.close()
Output:
10
You can use beautifulsoup4's select function:
import requests
from bs4 import BeautifulSoup
result = requests.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
c = result.text
soup = BeautifulSoup(c, "html.parser")
comment_items = soup.select("div.name.main-name")
print(len(comment_items))
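Keep in mind that select only sees whatever HTML you hand to it, and as the other answers point out these divs are injected by JavaScript, so the plain requests response will not contain them. The same selector works fine if you point it at the Selenium page source instead; a minimal sketch combining the two approaches (assuming chromedriver is on your PATH):
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()  # assumes chromedriver can be found on PATH
driver.get("https://genvita.vn/thu-thach/7-ngay-detox-da-dep-dang-thon-nguoi-khoe-qua-soc-len-den-8-trieu-dong")
soup = BeautifulSoup(driver.page_source, "html.parser")
comment_items = soup.select("div.name.main-name")  # same selector as above
print(len(comment_items))
driver.close()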
I am trying to download the data on this website
https://coinmunity.co/
...in order to manipulate it later in Python or Pandas.
I have tried to load it directly into Pandas via requests, but it did not work, using this code:
import requests
import pandas as pd
from bs4 import BeautifulSoup
res = requests.get("https://coinmunity.co/")
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
dfm = pd.read_html(str(table), header = 0)
dfm = dfm[0].dropna(axis=0, thresh=4)
dfm.head()
In most of the things I tried, I could only get at the information in the headers, which seems to be the only table on this page that the code can see.
Seeing that this did not work, I tried the same scraping with requests and BeautifulSoup, but that did not work either. This is my code:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://coinmunity.co/")
soup = BeautifulSoup(res.content, 'lxml')
#table = soup.find_all('table')[0]
#table = soup.find_all('div', {'class':'inner-container'})
#table = soup.find_all('tbody', {'class':'_ngcontent-c0'})
#table = soup.find_all('table')[0].findAll('tr')
#table = soup.find_all('table')[0].find('tbody')#.find_all('tbody _ngcontent-c3=""')
table = soup.find_all('p', {'class':'stats change positiveSubscribers'})
You can see, in the commented lines, all the things I have tried, but nothing worked.
Is there any way to easily download that table for use in Pandas/Python, in the tidiest, easiest and quickest possible way?
Thank you
Since the content is loaded dynamically after the initial request is made, you won't be able to scrape this data with requests. Here's what I would do instead:
from selenium import webdriver
import pandas as pd
import time
from bs4 import BeautifulSoup
driver = webdriver.Firefox()
driver.implicitly_wait(10)
driver.get("https://coinmunity.co/")
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, 'lxml')
results = []
for row in soup.find_all('tr')[2:]:
    data = row.find_all('td')
    name = data[1].find('a').text
    value = data[2].find('p').text
    # get the rest of the data you need about each coin here, then add it to the dictionary that you append to results
    results.append({'name': name, 'value': value})
df = pd.DataFrame(results)
df.head()
name value
0 NULS 14,005
1 VEN 84,486
2 EDO 20,052
3 CLUB 1,996
4 HSR 8,433
You will need to make sure that geckodriver is installed and that it is on your PATH. I only scraped the name and value of each coin, but getting the rest of the information should be easy.
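As a shortcut, once Selenium has rendered the page you can also hand the whole source to pandas.read_html, which parses every table element it finds. A sketch, assuming the rendered page really does use a standard HTML table (the index 0 below is a guess, so inspect the returned list):
import pandas as pd
# driver is the Firefox instance from above, already sitting on https://coinmunity.co/
tables = pd.read_html(driver.page_source)  # one DataFrame per <table> found
dfm = tables[0]  # assumption: the coin table is the first one on the page
print(dfm.head())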
As per the subject, I'm trying to fetch the table from the page below using BeautifulSoup.
http://www.hkjc.com/english/racing/Horse.asp?HorseNo=T421
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import lxml
import xlrd
HorseNo = ["T421"]
driver = webdriver.PhantomJS(r'D:\Program Files\Python\Path\PhantomJS\bin\phantomjs.exe')
#driver = webdriver.Chrome(r'D:\Program Files\Python\Path\chromedriver.exe')
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + str(HorseNo)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class" :"bigborder", "width":"970"}).findAll("tr")
print(table)
for row in table:
    cells = row.findAll("td")
    print(cells)
The print(table) result is fine, but print(cells) is not able to return every td in the table. Would somebody advise me further? Thanks.
Try this below using requests:
from bs4 import BeautifulSoup
import requests
HorseNo = ["T421"]
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + str(HorseNo)
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class" :"bigborder", "width":"970"}).findAll("tr")
cells = []
for row in table:
    cell = row.findAll("td")
    cells.append(cell)
print(cells)
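If you want readable values rather than raw Tag objects, a small extension of the same loop keeps only the visible text of each cell (just a sketch; the row/column layout is whatever the table gives you):
rows = []
for row in table:
    # strip each cell down to its visible text
    rows.append([cell.get_text(strip=True) for cell in row.findAll("td")])
print(rows)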