I am trying to scrape multiple pages of a URL, but I am only able to scrape the first page. Is there a way to get all the pages?
Here is my code:
from bs4 import BeautifulSoup as Soup
import urllib.request, requests, re
import pandas as pd

pd.set_option('max_colwidth', 500)  # to remove column limit (otherwise, we'll lose some info)
df = pd.DataFrame()
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed', 'https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6', 'https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']

for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_=' row result')

for elem in targetElements:
    comp_name = elem.find('span', attrs={'class': 'company'}).getText().strip()
    job_title = elem.find('a', attrs={'class': 'turnstileLink'}).attrs['title']
    home_url = "http://www.indeed.com"
    job_link = "%s%s" % (home_url, elem.find('a').get('href'))
    job_addr = elem.find('span', attrs={'class': 'location'}).getText()
    date_posted = elem.find('span', attrs={'class': 'date'}).getText()
    description = elem.find('span', attrs={'class': 'summary'}).getText().strip()
    comp_link_overall = elem.find('span', attrs={'class': 'company'}).find('a')
    if comp_link_overall != None:
        comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
    else:
        comp_link_overall = None
    df = df.append({'comp_name': comp_name, 'job_title': job_title,
                    'job_link': job_link, 'date_posted': date_posted,
                    'overall_link': comp_link_overall, 'job_location': job_addr,
                    'description': description}, ignore_index=True)

df
df.to_csv('path\\web_scrape_Indeed.csv', sep=',', encoding='utf-8')
Please suggest if there is any way to do this.
Case 1: The code presented here is exactly what you have
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed', 'https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6', 'https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']

for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_=' row result')

for elem in targetElements:
The problem here is that targetElements is overwritten on every iteration of the first for loop, so by the time the second loop runs it only holds the results from the last URL.
To avoid this, indent the second for loop inside the first like so:
for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_=' row result')
    for elem in targetElements:
Case 2: The bug is not a result of improper indentation (i.e. your real code is not like what is in your original post)
If your code is properly indented, then it may be the case that targetElements is an empty list, meaning target.findAll('div', class_=' row result') does not return anything. In that case, visit the sites, inspect the DOM, and adjust your scraping program.
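A side note on the original question: each URL above only ever requests the first page of results for a company. A minimal sketch of also following the pagination, assuming Indeed exposes further result pages through a start query parameter in steps of 10 (an assumption - verify it against the site's actual "next page" links):

import urllib.request
from bs4 import BeautifulSoup as Soup

comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97']  # same list as above
max_pages = 5  # hypothetical cap on how many result pages to fetch per company

for url in comp_urls:
    for page in range(max_pages):
        paged_url = "%s&start=%d" % (url, page * 10)  # assumed pagination parameter
        target = Soup(urllib.request.urlopen(paged_url), "lxml")
        targetElements = target.findAll('div', class_=' row result')
        if not targetElements:
            break  # no more results for this company
        for elem in targetElements:
            pass  # same per-job extraction as above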
I'm not sure why my code isn't working. I get AttributeError: 'NoneType' object has no attribute 'find'
My code is as follows:
import requests
from bs4 import BeautifulSoup
import csv

root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
html = requests.get(root_url)
soup = BeautifulSoup(html.text, 'html.parser')
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
start_page = paging[1].text
last_page = paging[len(paging)-2].text

outfile = open('congregationlookup.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])

pages = list(range(1,int(last_page)+1))
for page in pages:
    url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' %(page)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    #print(soup.prettify())
    print ('Processing page: %s' %(page))
    name_list = soup.findAll("div",{"class":"views-field views-field-congregation"})
    for element in name_list:
        name = element.find('h3').text
        address = element.find('field-content mb-2').text.strip()
        phone = element.find("i",{"class":"fa fa-phone mr-1"}).text.strip()
        writer.writerow([name, address, phone])
outfile.close()
print ('Done')
I'm trying to scrape the name, address, and phone number from the URJ Congregations website.
Thank you
Final code
import csv
import requests
from bs4 import BeautifulSoup

# root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
# html = requests.get(root_url)
# soup = BeautifulSoup(html.text, 'html.parser')
# paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
# start_page = paging[1].text
# last_page = paging[len(paging) - 3].text

outfile = open('congregationlookup.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])

pages = list(range(1, 1000))
for page in pages:
    url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' % (page)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    # print(soup.prettify())
    print('Processing page: %s' % (page))
    elements = soup.find_all("div", {"class": "views-row"})
    if len(elements) == 0:
        break
    for element in elements:
        name = element.find("div", {"class": "views-field views-field-congregation"}).text.strip()
        address = element.find("div", {"class": "views-field views-field-country"}).text.strip()
        phone = element.find("div", {"class": "views-field views-field-website"}).text.strip().split("\n")[0]
        writer.writerow([name, address, phone])
outfile.close()
print('Done')
Most likely, one of your find() calls is returning None, so when you chain another .find() onto that result you are calling a method on None, hence your error.
https://docs.python.org/3/library/stdtypes.html#str.find
Also, as an FYI, findAll() is the old BeautifulSoup 3 syntax; you should use find_all(). See: Difference between "findAll" and "find_all" in BeautifulSoup
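More generally, it helps to guard against None before chaining another find() call. A minimal sketch of that pattern (the aria-label and class values here are the ones used elsewhere in this thread and should be checked against the live page):

import requests
from bs4 import BeautifulSoup

root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
soup = BeautifulSoup(requests.get(root_url).text, "html.parser")

# check each lookup before chaining the next one
nav = soup.find("nav", {"aria-label": "pagination-heading--3"})
if nav is None:
    print("pagination nav not found - inspect the page and adjust the selector")
else:
    pagination = nav.find("ul", {"class": "pagination"})
    links = pagination.find_all("a") if pagination is not None else []
    print([a.text.strip() for a in links])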
There are a number of problems.
The first problem is
"pagination-heading--3"
instead of
"pagination-heading-3"
Next, I changed
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
to
paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
This is the line where I swapped in the first corrected string. I also changed the second search to look for the ul: you were finding a single li and searching inside of it, which would have produced an empty list.
Next,
last_page = paging[len(paging) - 3].text
since you are trying to get the third element from the end.
It still doesn't work; I will keep updating.
1) I am trying to scrape data for multiple URLs stored in a CSV file, but the result gives None.
2) I want to store the fetched data row by row in a dataframe named df, but it only stores one row.
Here's my code (I have pasted it from the point where the data extraction starts):
import csv

df = pd.DataFrame()
with open('test1.csv', newline='', encoding='utf-8-sig') as f:
    reader = csv.reader(f)
    for line in reader:
        link = line[0]
        print(type(link))
        print(link)
        driver.get(link)
        height = driver.execute_script("return document.body.scrollHeight")
        for scrol in range(100, height, 100):
            driver.execute_script(f"window.scrollTo(0,{scrol})")
            time.sleep(0.2)
        src = driver.page_source
        soup = BeautifulSoup(src, 'lxml')
        name_div = soup.find('div', {'class': 'flex-1 mr5'})
        name_loc = name_div.find_all('ul')
        name = name_loc[0].find('li').get_text().strip()
        loc = name_loc[1].find('li').get_text().strip()
        connection = name_loc[1].find_all('li')
        connection = connection[1].get_text().strip()
        exp_section = soup.find('section', {'id': 'experience-section'})
        exp_section = exp_section.find('ul')
        div_tag = exp_section.find('div')
        a_tag = div_tag.find('a')
        job_title = a_tag.find('h3').get_text().strip()
        company_name = a_tag.find_all('p')[1].get_text().strip()
        joining_date = a_tag.find_all('h4')[0].find_all('span')[1].get_text().strip()
        exp = a_tag.find_all('h4')[1].find_all('span')[1].get_text().strip()
        df['name'] = [name]
        df['location'] = [loc]
        df['connection'] = [connection]
        df['company_name'] = [company_name]
        df['job_title'] = [job_title]
        df['joining_date'] = [joining_date]
        df['tenure'] = [exp]
df
output -
name location connection company_name job_title joining_date tenure
0 None None None None None None None
I am not sure whether the for loop goes wrong or what the exact problem is, but for a single URL it works fine.
I am using Beautiful Soup for the first time, so I don't have much experience with it. Please help me make the desired changes. Thanks.
I don't think the end of your code is appending new rows to the dataframe.
Try replacing df['name'] = [name] and the other assignment lines with the following:
new_line = {
    "name": [name],
    "location": [loc],
    "connection": [connection],
    "company_name": [company_name],
    "job_title": [job_title],
    "joining_date": [joining_date],
    "tenure": [exp],
}
temp_df = pd.DataFrame.from_dict(new_line)
df = df.append(temp_df, ignore_index=True)  # append returns a new DataFrame, so assign it back
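As a follow-up, note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal alternative sketch is to collect one dict per profile in a plain list and build the DataFrame once after the loop (the placeholder values below just stand in for the fields you scrape):

import pandas as pd

rows = []  # one dict per scraped profile
for link in ["https://example.com/profile-1", "https://example.com/profile-2"]:  # hypothetical links
    # ... scrape name, loc, connection, etc. for this link, then:
    rows.append({
        "name": "placeholder name",
        "location": "placeholder city",
        "connection": "500+",
        "company_name": "placeholder company",
        "job_title": "placeholder title",
        "joining_date": "Jan 2020",
        "tenure": "2 yrs",
    })

df = pd.DataFrame(rows)  # builds one row per profile
print(df)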
I'm trying to use BeautifulSoup to select the date, URL, description, and additional URL from a table, and I'm having trouble accessing them given the odd white space:
So far I've written:
import urllib
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

soup = make_soup('https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2010.shtml')
test1 = soup.findAll("td", {"nowrap" : "nowrap"})
test2 = [item.text.strip() for item in test1]
With bs4 4.7.1+ you can use :has and :nth-of-type in combination with next_sibling to get those columns.
from bs4 import BeautifulSoup
import requests, re

def make_soup(url):
    the_page = requests.get(url)
    soup_data = BeautifulSoup(the_page.content, "html.parser")
    return soup_data

soup = make_soup('https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2010.shtml')

releases = []
links = []
dates = []
descs = []
addit_urls = []

for i in soup.select('td:nth-of-type(1):has([href^="/litigation/litreleases/"])'):
    sib_sib = i.next_sibling.next_sibling.next_sibling.next_sibling
    releases += [i.a.text]
    links += [i.a['href']]
    dates += [i.next_sibling.next_sibling.text.strip()]
    descs += [re.sub(r'\t+|\s+', ' ', sib_sib.text.strip())]
    addit_urls += ['N/A' if sib_sib.a is None else sib_sib.a['href']]

result = list(zip(releases, links, dates, descs, addit_urls))
print(result)
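If you want the scraped rows in tabular form afterwards, a short follow-up sketch (pandas is my addition here, not part of the original answer; the sample tuple is a placeholder so the snippet runs on its own):

import pandas as pd

# `result` is the list of (release, link, date, description, additional_url) tuples built above
result = [('LR-00000', '/litigation/litreleases/2010/lr00000.htm', 'Jan. 1, 2010', 'placeholder description', 'N/A')]

df = pd.DataFrame(result, columns=['release', 'link', 'date', 'description', 'additional_url'])
df.to_csv('sec_litigation_2010.csv', index=False)
print(df)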
Unfortunately there is no class or id HTML attribute to quickly identify the table to scrape; after experimentation I found it was the table at index 4.
Next we ignore the header by separating it from the data, which still has table rows that are just separations for quarters. We can skip over these using a try-except block since those only contain one table data tag.
I noticed that the description is separated by tabs, so I split the text on \t.
For the urls, I used .get('href') rather than ['href'] since not every anchor tag has an href attribute from my experience scraping. This avoids errors should that case occur. Finally the second anchor tag does not always appear, so this is wrapped in a try-except block as well.
data = []
table = soup.find_all('table')[4]  # target the specific table
header, *rows = table.find_all('tr')

for row in rows:
    try:
        litigation, date, complaint = row.find_all('td')
    except ValueError:
        continue  # ignore quarter rows
    id = litigation.text.strip().split('-')[-1]
    date = date.text.strip()
    desc = complaint.text.strip().split('\t')[0]
    lit_url = litigation.find('a').get('href')
    try:
        comp_url = complaint.find('a').get('href')
    except AttributeError:
        comp_url = None  # complaint url is optional
    info = dict(id=id, date=date, desc=desc, lit_url=lit_url, comp_url=comp_url)
    data.append(info)
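One more detail: as the selector in the other answer shows, the href values in this table are root-relative paths (they start with /litigation/litreleases/), so if you need absolute URLs you can join them against the site root. A minimal sketch (the example path is a placeholder):

from urllib.parse import urljoin

base = 'https://www.sec.gov'
relative = '/litigation/litreleases/2010/lr00000.htm'  # placeholder relative href
print(urljoin(base, relative))  # -> https://www.sec.gov/litigation/litreleases/2010/lr00000.htm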
I'm trying to extract the stock price and the market cap data from a Korean website.
Here is my code:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', { 'class': 'type_2' })

data = []
for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        if td.find('a'):
            company_name = td.find('a').text
            price_now = tds[2].text
            market_cap = tds[5].text
            data.append([company_name, price_now, market_cap])

print(*data, sep = "\n")
And this is the result I get. (Sorry for the Korean characters)
['삼성전자', '43,650', '100']
['', '43,650', '100']
['SK하이닉스', '69,800', '5,000']
['', '69,800', '5,000']
The second and the fourth line in the output should not be there. I just want the first and the third line. Where do lines two and four come from, and how do I get rid of them?
My dear friend, I think the problem is that you should check whether td.find('a').text actually has a value!
So I changed your code to this, and it works!
import requests
from bs4 import BeautifulSoup

response = requests.get(
    'http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        # where the magic happens!
        if td.find('a') and td.find('a').text:
            company_name = td.find('a').text
            price_now = tds[2].text
            market_cap = tds[5].text
            data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
While I can't test it, it could be because there are two a tags in each row of the page you're trying to scrape, while your for loop and if statement are set up to append information whenever they find an a tag. The first one has the name of the company, but the second one has no text, hence the blank output (since td.find('a').text just grabs the text of whichever a tag was matched).
For reference, this is the a tag you want:
<a href="...">삼성전자</a>
This is what you're picking up the second time around:
<img src="https://ssl.pstatic.net/imgstock/images5/ico_debatebl2.gif" width="15" height="13" alt="토론실">
Perhaps you can change your if statement to check that the class of the a tag is title (or whatever the actual class is), so that you only enter the if block when you're looking at the a tag with the company name in it.
I'm at work so I can't really test anything, but let me know if you have any questions later!
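A minimal sketch of that idea (the class name 'tltle' is an assumption about the company-name links on this Naver page, not something I have verified - inspect the DOM and adjust it if needed):

import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    # only keep rows that contain the company-name anchor; 'tltle' is an assumed class name
    name_tag = tr.find('a', {'class': 'tltle'})
    if name_tag:
        tds = tr.find_all('td')
        data.append([name_tag.text, tds[2].text, tds[5].text])  # same column indexes as in your code

print(*data, sep="\n")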
Check the tds: there should be 13 of them in a data row, and there is no need for the nested for loop.
import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', { 'class': 'type_2' })

data = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) == 13:
        company_name = tds[1].text
        price_now = tds[2].text
        market_cap = tds[6].text
        data.append([company_name, price_now, market_cap])

print(*data, sep = "\n")
result
['삼성전자', '43,650', '2,802,035']
['SK하이닉스', '69,800', '508,146']
['삼성전자우', '35,850', '323,951']
['셀트리온', '229,000', '287,295']
['LG화학', '345,500', '243,897']
I want to build a list with coins from coinmarketcap.com.
Every element should be a tuple.
Something like:
coins = [('btc',8500,'+0.5%','+1.2%', '-1%'), ...]
I can't get the percentages.
The information is in a td like this:
<td class="no-wrap percent-change text-right positive_change" data-timespan="1h" data-percentusd="0.99" data-symbol="BTC" data-sort="0.991515">0.99%</td>
How can I access the 0.99% value above? In fact I need data-percentusd from the td, but I don't know how to get it.
My testing script is something like:
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/all/views/all/'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
name = soup.find_all('a', class_='currency-name-container')
price = soup.find_all('a', class_='price')
print(name)
print(price)
#how can I get the percentage change for 1h, 24h, 7d?
#delta_h = soup.find_all('td', ???)
You can loop over the rows of the table to get the data for each currency and store it in a tuple, and then add it to the list.
import requests
from bs4 import BeautifulSoup

r = requests.get('https://coinmarketcap.com/all/views/all/')
soup = BeautifulSoup(r.text, 'lxml')

data = []
table = soup.find('table', id='currencies-all')
for row in table.find_all('tr'):
    try:
        symbol = row.find('td', class_='text-left col-symbol').text
        price = row.find('a', class_='price').text
        time_1h = row.find('td', {'data-timespan': '1h'}).text
        time_24h = row.find('td', {'data-timespan': '24h'}).text
        time_7d = row.find('td', {'data-timespan': '7d'}).text
    except AttributeError:
        continue
    data.append((symbol, price, time_1h, time_24h, time_7d))

for item in data:
    print(item)
Partial Output:
('BTC', '$8805.46', '0.88%', '-12.30%', '-19.95%')
('ETH', '$677.45', '0.98%', '-11.54%', '-21.66%')
('XRP', '$0.780113', '0.62%', '-10.63%', '-14.42%')
('BCH', '$970.70', '1.01%', '-11.33%', '-23.89%')
('LTC', '$166.70', '0.74%', '-10.06%', '-19.56%')
('NEO', '$83.55', '0.24%', '-16.29%', '-33.39%')
('XLM', '$0.286741', '1.13%', '-13.23%', '-11.84%')
('ADA', '$0.200449', '0.63%', '-16.92%', '-31.43%')
('XMR', '$256.92', '0.63%', '-19.98%', '-19.46%')
Since the data is missing for some currencies in the table, the code will raise an AttributeError for .text. To skip those currencies, I've used the try-except.
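To answer the other part of the question: data-percentusd is just an HTML attribute on the td, so you can read it straight off the tag instead of (or in addition to) its text. A small sketch against the same table (the attribute names come from the HTML snippet in the question):

import requests
from bs4 import BeautifulSoup

r = requests.get('https://coinmarketcap.com/all/views/all/')
soup = BeautifulSoup(r.text, 'lxml')

table = soup.find('table', id='currencies-all')
for row in table.find_all('tr'):
    td_1h = row.find('td', {'data-timespan': '1h'})
    if td_1h is None:
        continue  # row without percentage data
    # attributes are read with .get() or dictionary-style access
    print(td_1h.get('data-symbol'), td_1h.get('data-percentusd'), td_1h.text)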