I want to extract the text of a div selected by its id, but every id has a different value, for example:
div',id='statement80863
div',id='statement26092
and so on ............................
CODE
# Download the list page and print the text of the statement <div> found in each "row" div.
import requests
from bs4 import BeautifulSoup
import re
# Interpolated into the RANGE query parameter of the URL below.
limit = 100
# NOTE(review): "ÐNIC=" looks like a mis-decoded "&ETHNIC=" — confirm against the original URL.
url = f'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=ÐNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F{limit}&SORT=LASTNAME'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
rows = soup.find_all('div', {'class':'row'})
for row in rows:
# NOTE(review): the id is hard-coded, so find() returns None for every row that
# does not contain a div with exactly this id, and .text then raises AttributeError.
des=row.find('div',id='statement80863').text
print(des)
You can use a regular expression to select only such <div> tags.
row.find('div', {'id': re.compile('^statement.*')}) - returns the first <div> tag inside the row whose id starts with the word statement (use find_all instead of find to get every match).
import re
import requests
from bs4 import BeautifulSoup

# List page for the first 100 entries (RANGE=1%2F100). The query string uses
# "&ETHNIC=" here; the pasted URL had it mis-decoded as "ÐNIC=".
url = 'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F100&SORT=LASTNAME'
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
rows = soup.find_all('div', class_='row')

# Compiled once outside the loop. '^statement' matches any id that starts with
# "statement"; the earlier '^statement*' only quantified the final "t".
statement_id = re.compile(r'^statement')

for row in rows:
    # find() returns the first matching <div> in this row, or None if there is none.
    d = row.find('div', {'id': statement_id})
    if d:
        # Your scraping code here...
        print(d.get_text(strip=True))
Related
I'm trying to get all the category, sub category and sub sub category and so on of authors URL from dmoz website using BeautifulSoup.
I'm getting the following output:
# Missing the every 2nd option/URL in first step
/Arts/Literature/Authors/A
/Arts/Literature/Authors/C
/Arts/Literature/Authors/E
/Arts/Literature/Authors/G
/Arts/Literature/Authors/Horror
. . .
# Missing the every 1st option/URL in second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above output, every 2nd element is missing in the 1st step and every 1st element is missing in the 2nd step.
Here is my code:
# Collect category links from the start page, then visit each collected link in turn.
# session, bs, sub_cat and records are not defined in this snippet; they must exist beforehand.
scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
# The [6:7] slice keeps only the "row" div at index 6 (an empty list if there are fewer).
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# get all the root category author list
for test in find_row:
if test.find('div', attrs = {'class':'panel-body'}):
test_link = test.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
# now get the sub or sub-sub category author URL list
# NOTE(review): sub_cat is appended to and has items removed below while this loop
# iterates over it (presumably all inside the loop; indentation was lost).
# Removing the current item shifts the next one into its slot, so the loop skips it.
for cat in sub_cat:
scrape_cat_url = "http://dmoz.org%s" % (cat)
print('scraping...', scrape_cat_url)
page = session.get(scrape_cat_url, headers={
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs = {'class':'row'})[6:7]
# if sub category go next level or restart
for row in find_row:
if row.find('div', attrs = {'class':'panel-body'}):
test_link = row.find_all('a')
for link in test_link:
sub_cat.append(link['href'])
records.append(scrape_cat_url)
else:
records.append(scrape_cat_url)
# remove the category url from the sub_cat list
sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub category and sub sub category URL of authors?
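Entries are skipped because your loop appends to and removes from sub_cat while iterating over it: removing the current element shifts the next one into its position, so the for loop steps over it. Keep a separate list for each level instead. Try this streamlined version of your code: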
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
# Only the "row" div at index 6 is used. Slicing with [6:7] instead of indexing
# gives an empty list, not an IndexError, when the page has fewer rows.
find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]
# href=True skips anchors without an href, which would raise KeyError below.
cats = [
    link['href']
    for row in find_rows
    for link in row.find_all('a', href=True)
]
# Bare expression: shows the list in a notebook/REPL; use print(cats) in a script.
cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
# Visit every category URL collected in `cats` and gather the links found on each page.
# Relies on `cats`, `headers`, `requests` and `BeautifulSoup` from the previous snippet.
sub_cats = []
for cat in cats:
    scrape_url = f"http://dmozlive.com{cat}"
    page = requests.get(scrape_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Same selection as for the top level: only the "row" div at index 6, if present.
    find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]
    for row in find_rows:
        # href=True skips anchors without an href, which would raise KeyError.
        sub_cats.extend(link['href'] for link in row.find_all('a', href=True))
# Bare expression: shows the list in a notebook/REPL; use print(sub_cats) in a script.
sub_cats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
The following code may meet your expectations: it pulls all the category and sub-category URLs.
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://dmozlive.com'
url = 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

# First level: every <a> inside an element carrying both the list-group and col-md-6 classes.
for cat_link in soup.select('.list-group.col-md-6 a'):
    cat_href = cat_link.get('href')
    if cat_href is None:
        # An anchor without href cannot be followed (concatenating None raises TypeError).
        continue
    cat_url = BASE_URL + cat_href
    #print(cat_url)
    req2 = requests.get(cat_url, headers=headers)
    soup2 = BeautifulSoup(req2.text, 'html.parser')
    # Second level: every element with the list-group-item class on the category page.
    for author_link in soup2.select('.list-group-item'):
        author_href = author_link.get('href')
        if author_href is None:
            # Skip elements without href instead of printing "http://dmozlive.comNone".
            continue
        print(BASE_URL + author_href)
I'm trying to webscrape different stocks by rows, with the data scraped from https://www.slickcharts.com/sp500. I am following a tutorial using a similar website, however that website uses classes for each of its rows, while mine doesn't (attached below).
This is the code I'm trying to use, however I don't get any output whatsoever. I'm still pretty new at coding so any feedback is welcome.
# Download the SlickCharts page and print the first <td> of every <tr>.
import requests
import pandas as pd
from bs4 import BeautifulSoup
company = []
symbol = []
url = 'https://www.slickcharts.com/sp500' #Data from SlickCharts
# No headers are passed, so requests sends its default User-Agent.
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
rows = soup.find_all('tr')
for i in rows:
row = i.find_all('td')
# NOTE(review): row is an empty list for a <tr> without <td> cells, so row[0] raises IndexError there.
print(row[0])
First of all, you need to add some headers to your request because most likely you get the same as me: status code 403 Forbidden. It's because the website is blocking your request. Adding User-Agent does the trick:
# Send a browser-like User-Agent header with the request.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
# `url` is the page address defined earlier in the script.
page = requests.get(url, headers=headers)
Then you can iterate over the tr tags as you do. But you should be careful: for example, the first tr doesn't have any td tags, so you will get an IndexError on this line:
print(row[0])
Here is the example of code that prints names of all companies:
import requests
from bs4 import BeautifulSoup

company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500' #Data from SlickCharts
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
# Every <tr> on the page, regardless of which table it belongs to.
rows = soup.find_all('tr')
for row in rows:
    all_td_tags = row.find_all('td')
    # Index 1 is read below, so at least two <td> cells are required; this also
    # skips rows that have no <td> at all (e.g. the first <tr>).
    if len(all_td_tags) > 1:
        print(all_td_tags[1].text)
But this code also outputs some other data besides company names. It's because you are iterating over all tr tags on the page. But you need to iterate over a specific table only (first table on the page in this case).
import requests
from bs4 import BeautifulSoup

company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500' #Data from SlickCharts
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
# Restrict the search to the first <table> so rows of other tables are not printed.
first_table_on_the_page = soup.find('table')
# find() returns None when the response contains no <table>; treat that as "no rows".
rows = first_table_on_the_page.find_all('tr') if first_table_on_the_page else []
for row in rows:
    all_td_tags = row.find_all('td')
    # Index 1 is read below, so at least two <td> cells are required; this also
    # skips rows that have no <td> at all (e.g. the first <tr>).
    if len(all_td_tags) > 1:
        print(all_td_tags[1].text)
How can I get all the 'href' values with BeautifulSoup in Python? I have tried many times, but in vain.
Whether I use the 'soup.find' or the 'soup.find_all' method to get the 'href', it doesn't work.
python version:3.10
# Collect the href of the first link in each matching <ul> on the listing page.
# The "!pip" line is notebook shell syntax, not Python.
!pip install requests
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
# range(1,2) yields only page 1.
for page in range(1,2):
# NOTE(review): this is a plain string, not an f-string, so "{page}" is sent literally in the URL.
url = "https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
r = requests.get(url, headers = headers)
Soup = BeautifulSoup(r.text,"lxml")
for link in Soup.find_all('ul',class_="searchItem Stype"):
print(len(link))
# NOTE(review): link.li is None when the <ul> contains no <li>, and .a then raises AttributeError.
Link = link.li.a
LINK = Link.get('href')
print(LINK)
productlink.append(LINK)
print(productlink)
Sorry, I totally misunderstood your problem. find_all is not a very versatile tool, and you were searching for the wrong ul.
I barely changed your code, but it seems to work now:
# Collect product links from the listing page using a CSS selector.
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
productlink = []
headers = {'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
# range(1,2) yields only page 1.
for page in range(1,2):
# f-string: the page number is interpolated into the URL.
url = f"https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
r = requests.get(url, headers = headers)
Soup = BeautifulSoup(r.text,"lxml")
# CSS selector: <a> elements that have an href, are direct children of an <li>
# directly under <ul id="surveyContent">, and are the first <a> among their siblings.
for link in Soup.select('ul#surveyContent > li > a[href]:first-of-type'):
print(len(link))
# ~ Link = link.li.a
LINK = link.get('href')
print(LINK)
productlink.append(LINK)
print(productlink)
import re

# Matches e.g. "discountPrice = 1234" in inline script text; group 1 is the 1-5 digit number.
# Compiled once here instead of on every loop iteration.
discount_regex = re.compile(r'discountPrice = (\d{1,5})')

# Product page URLs built from the goods_code attribute of each matching link.
# Relies on `headers`, `requests` and `BeautifulSoup` from the preceding snippet.
goodlink = []
for page in range(1,2):
    url = "https://m.momomall.com.tw/m/store/DCategory.jsp?entp_code=103487&category_code=all&orderby=3&page={}".format(page)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for goods_code in soup.select('a.nofollowBtn_star'):
        code = goods_code.get('goods_code')
        if code is None:
            # No goods_code attribute: nothing to build a product URL from.
            continue
        goodlink.append('https://www.momomall.com.tw/s/103487/' + code + '/')

for URL in goodlink:
    R = requests.get(URL, headers=headers)
    Soup = BeautifulSoup(R.text, "lxml")
    for dataprice in Soup.select('script'):
        # re needs a string, not a Tag; .string is None for scripts without inline text.
        script_text = dataprice.string
        if not script_text:
            continue
        match = discount_regex.search(script_text)
        # Most scripts do not contain the price; only print when the pattern is found.
        if match:
            print(match.group(1))
```
# Download the Yahoo Finance history page for KO and collect the text of each matching <tr>.
import urllib.request
from bs4 import BeautifulSoup
headers = {'User-Agent':' Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
url = 'https://finance.yahoo.com/quote/KO/history?period1=-252374400&period2=1595116800&interval=1mo&filter=history&frequency=1mo'
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
html = resp.read()
soup = BeautifulSoup(html,'html.parser')
# Only <tr> elements present in the downloaded HTML can be found here.
data = soup.find_all("tr", {'class':'BdT Bdc($seperatorColor) Ta(end) Fz(s) Whs(nw)'})
# get_text("|") joins the text pieces of each row with "|" as separator.
data = [x.get_text("|") for x in data]
When I go through what it finds, it only finds the first 75 instances of that tag and ignores the rest. How do I get more than 75 instances?
Any idea how I can retrieve the price (currently 2917.99) from the source of this page: https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/
If I look up the class with p.product-new-price, I get None.
I have managed to get the title, but not the price.
What I have done so far:
# Download the product page and try to print the text of the price element.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers = headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')
# NOTE(review): this searches for a <div> with class product-new-price; find() returns
# None when no such <div> exists, and div.string below then raises AttributeError.
div = soup.find('div', {"class" : 'product-new-price'})
text = div.string
print(text)
The class looks like below and I want to extract the 2917 as int.
div class="product-highlight product-page-pricing"
p class="product-new-price"
2.917<sup>99</sup> <span>Lei</span>
Thank you very much!
Ok, with minor modifications:
It seems that the class product-new-price is on the p element for me!
I am assuming there will always be a <sup> tag after the main price
import requests
from bs4 import BeautifulSoup

URL = 'https://www.emag.ro/televizor-led-smart-samsung-138-cm-55ru7402-4k-ultra-hd-ue55ru7402uxxh/pd/DTN2XZBBM/'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find('title')

# The price class sits on a <p> element, not on a <div>.
p = soup.find('p', {"class": 'product-new-price'})
# The main price is the text node directly before the <sup> tag (e.g. "2.917").
sup = p.find('sup') if p is not None else None
main_price_node = sup.previous_sibling if sup is not None else None
if main_price_node is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise SystemExit("Could not find the price element on the page")

# previous_sibling is the current Beautiful Soup name for the old previousSibling alias.
value = str(main_price_node).strip()
print("Value: {}".format(value))
# Keep only digits: drops the thousands separator, so "2.917" becomes "2917".
value = ''.join(c for c in value if c.isdigit())
price = int(value)
print("Price: {}".format(price))
The above prints:
$ python3 ./test.py
Value: 2.917
Price: 2917
Now, with small changes you can also add the missing .99 if this is required