Web scraping with BeautifulSoup: find and replace missing nodes with None - python

I'm using the following code to scrape web items with BeautifulSoup:
item_id = []
items = soup.find_all('div', class_='item-id')
for one_item in items:
    list_item = one_item.text
    item_id.append(list_item)
However, some items are missing, and when I run the code I only get the list of the items that are available. How can I proceed to get the entire list, with the missing items listed as "None"?

import requests
from bs4 import BeautifulSoup as bsoup

site_source = requests.get("https://search.bvsalud.org/global-literature-on-novel-coronavirus-2019-ncov/?output=site&lang=en&from=0&sort=&format=summary&count=100&fb=&page=1&skfp=&index=tw&q=%28%22rapid+test%22+OR+%22rapid+diagnostic+test%22%29+AND+sensitivity+AND+specificity").content
soup = bsoup(site_source, "html.parser")

item_list = soup.find_all('div', class_='textArt')
result_list = []
for item in item_list:
    result = item.find('div', class_='reference')
    if result is None:
        result_list.append('None')
    else:
        result_list.append(result.text)

for result in result_list:
    print(result)
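If you want the list to hold the actual None object rather than the string 'None', the same loop collapses into a conditional expression. A minimal variant of the snippet above, reusing its item_list:

# Stores None itself (not the string 'None') when the 'reference' div is missing
result_list = [item.find('div', class_='reference').text
               if item.find('div', class_='reference') else None
               for item in item_list]
print(result_list)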

Related

Extracting URL From Span Element Without href

I am attempting to extract links from a website that does not use href. I have tried multiple approaches to find the tag associated with the URL, which from what I can gather sits between <span> elements.
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')
for list in lists:
    title = list.find('span', class_ = '$0')
    webs = list.find('#text', class_ = 'fa-fa.link')
    address = list.find('ul', class_ = 'post-meta')
    temp = list.find('span', class_ = 'text')
    temp2 = list.find('i', class_ = '(text)')
    info = [title, webs, address, temp, temp2]
    f.write(str(info))
    f.write("\n")
    print(info)
The desired output is to extract the data between <span></span> (e.g. the address 345 40th Ave N), as well as the URL and the phone number that follow <i class="fa fa-link"> and <i class="fa fa-phone">, and place the three elements into a CSV file.
You could call .next after selecting the <i> with class fa-link, i.e. e.find(class_ = 'fa-link').next:
for e in lists:
    print(e.find(class_ = 'fa-link').next.strip() if e.find(class_ = 'fa-link') else '')
Note: Do not shadow built-in names like list, and always check whether the element you are searching for is available.
Example
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

with open('somefile.csv', 'a', encoding='utf-8') as f:
    for e in soup.find_all('div', class_ = 'listing-item-inner'):
        title = e.h3.text
        webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
        address = e.span.text
        phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
        f.write(','.join([title, webs, address, phone])+'\n')
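Since titles and addresses can themselves contain commas, ','.join() can produce a malformed CSV. A sketch of the same loop using the standard csv module, which quotes such fields automatically (not part of the original answer):

import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

with open('somefile.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)  # quotes fields containing commas or newlines
    for e in soup.find_all('div', class_ = 'listing-item-inner'):
        title = e.h3.text
        webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
        address = e.span.text
        phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
        writer.writerow([title, webs, address, phone])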

How to list out all the h2, h3, and p tags then create a dataframe to store them

I was given a website to scrape all of the key items.
But the output I got covers only one item using BeautifulSoup4, so I wonder if I need to use something like soup.find_all to extract all the key items from the website into a list.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
column = soup.find(class_ = re.compile('columns is-multiline'))
print(column.prettify())
position = column.h2.text
company = column.h3.text
city_state = column.find_all('p')[-2].text
print(position, company, city_state)
Thank you.
Try this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

positions = [pos.text for pos in soup.find_all('h2')]
companies = [com.text for com in soup.find_all('h3')]

city_state0 = []
city_state1 = []
for p in soup.find_all('p', {'class' : 'location'}):
    city_state0.append(p.text.split(',')[0].strip())
    city_state1.append(p.text.split(',')[1].strip())

df = pd.DataFrame({
    'city_state1': city_state0,
    'city_state2': city_state1,
    'companies' : companies,
    'positions' : positions
})
print(df)
You need to use find_all to get all the elements like so. find only gets the first element.
titles = soup.find_all('h2', class_='title is-5')
companies = soup.find_all('h3', class_='subtitle is-6 company')
locations = soup.find_all('p', class_='location')

# loop over locations and extract the city and state
# (each location is a Tag, so take .text before splitting)
for location in locations:
    city = location.text.split(', ')[0]
    state = location.text.split(', ')[1]
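If you also want the DataFrame from the first answer, one possible follow-up (a sketch assuming pandas, the three lists above, and that every location has the form "City, ST") is to zip the parallel lists row by row:

import pandas as pd

# titles, companies and locations are the find_all results from above
rows = []
for title, company, location in zip(titles, companies, locations):
    city, state = location.text.split(', ')
    rows.append({'position': title.text, 'company': company.text,
                 'city': city, 'state': state})

df = pd.DataFrame(rows)
print(df)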

Getting an AttributeError, when adding .text attribute

I have tried the script below and it works just fine:
from bs4 import BeautifulSoup
import requests

pr = input("search: ")
source = requests.get('https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'.format(pr)).content
soup = BeautifulSoup(source, 'html.parser')

url = soup.find_all('div', class_=('_3O0U0u'))
whole_product_list = []
whole_url_list = []
main_product_list = []
main_url_list = []

for i in url:
    tag_a_data = i.find_all('a')
    for l in tag_a_data:
        product_list = l.find('div', class_= '_3wU53n')
        if product_list:
            main_product_list.append(product_list.text)
        else:
            product_ok = l.get('title')
            main_product_list.append(product_ok)
print(main_product_list)
So, for example, if I pass "samsung" as input, it returns a list built from the available "div" elements with the given class ID passed as an argument; and if I pass something else as input, like "shoes", which has a "title" attribute, it returns a list of all the titles available in its HTML.
But if I reverse the order, like below:
from bs4 import BeautifulSoup
import requests

pr = input("search: ")
source = requests.get('https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'.format(pr)).content
soup = BeautifulSoup(source, 'html.parser')

url = soup.find_all('div', class_=('_3O0U0u'))
whole_product_list = []
whole_url_list = []
main_product_list = []
main_url_list = []

for i in url:
    tag_a_data = i.find_all('a')
    for l in tag_a_data:
        product_list = l.get('title')
        if product_list:
            main_product_list.append(product_list)
        else:
            product_ok = l.find('div', class_= '_3wU53n').text
            main_product_list.append(product_ok)
print(main_product_list)
it starts giving an attribute error:
Traceback (most recent call last):
  File "tess.py", line 28, in <module>
    product_ok = l.find('div', class_= '_3wU53n').text
AttributeError: 'NoneType' object has no attribute 'text'
I don't get why the first script works fine with the if/else logic but the second one does not.
In this line:
product_ok= l.find('div', class_= '_3wU53n').text
l.find('div', class_= '_3wU53n') returns None, meaning it doesn't find the div. None has no text attribute, so an AttributeError is raised.
A fix would be to use the walrus operator (Python 3.8+):
if product_ok := l.find('div', class_= '_3wU53n'):
    product_ok = product_ok.text
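A minimal, self-contained illustration of that pattern (using a tiny inline HTML snippet instead of the live Flipkart page, so the tags here are only stand-ins):

from bs4 import BeautifulSoup

html = '<a title="Shoe X"></a><a><div class="_3wU53n">Samsung Y</div></a>'
soup = BeautifulSoup(html, 'html.parser')

names = []
for l in soup.find_all('a'):
    if product_ok := l.find('div', class_='_3wU53n'):
        names.append(product_ok.text)   # the div exists: take its text
    else:
        names.append(l.get('title'))    # fall back to the title attribute

print(names)  # ['Shoe X', 'Samsung Y']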
Suppose you have the following data collected for your l values:
item1: <title>title1</title><div class="_3wU53n">xyz</div>
item2: <title>title1</title><div>xyz</div>
item3: <title>title1</title><div class="_3wU53n">xyz</div>
Using the first code, your product_list variable will contain item1 and item3. Then you can get the title of the given items as they are available. So the code works without any problem.
Using the second code, your product_list variable will contain item1, item2, and item3. But in this case, you won't get the required div tag, as it doesn't exist for the second item. This causes the attribute error.
The point is that items in the database will always have a title, but they won't always have the required div tag.
The following change should get it working:
from bs4 import BeautifulSoup
import requests

pr = input("search: ")
source = requests.get('https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'.format(pr)).content
soup = BeautifulSoup(source, 'html.parser')

url = soup.find_all('div', class_=('_3O0U0u'))
whole_product_list = []
whole_url_list = []
main_product_list = []
main_url_list = []

for i in url:
    tag_a_data = i.find_all('a')
    for l in tag_a_data:
        product_list = l.get('title')
        if product_list:
            main_product_list.append(product_list)
        else:
            if l.find('div', class_='_3wU53n'):
                product_ok = l.find('div', class_= '_3wU53n').text
                main_product_list.append(product_ok)
print(main_product_list)

Python issue when crawling multiple page titles

I am a marketer and want to conduct some basic market research using Python.
I wrote some simple code to crawl titles across multiple pages, but it does not manage to put the title text into a list and transfer it into Excel format. What can I do in this case?
I tried to create a list and used the extend() method to put these looped titles on the list, but it did not work:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def content_get(url):
    count = 0
    while count < 4:  # this case was to crawl the titles of 4 pages
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        titles = soup.find(id="main-container").find_all("div", class_="r-ent")
        for title in titles:
            print([title.find('div', class_='title').text])
        nextpageurl = soup.find("a", string="‹ 上頁")["href"]
        url = "https://www.ptt.cc" + nextpageurl
        count += 1

firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
content_get(firstpage)
You need to add the titles to a list outside of the while loop:
def content_get(url):
    count = 0
    titles = []
    while count < 4:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        title_page = [title.text.replace('\n', '') for title in soup.find_all('div', {'class': 'title'})]
        titles.extend(title_page)
        nextpageurl = soup.find("a", string="‹ 上頁")["href"]
        url = "https://www.ptt.cc" + nextpageurl
        count += 1
    return titles
If you don't want the list comprehension that builds title_page, it can be replaced with a traditional for loop:
title_page = []
titles = soup.find_all('div', {'class': 'title'})
for title in titles:
    title_page.append(title.text.replace('\n', ''))
For the Excel file:
def to_excel(text):
    df = pd.DataFrame(text, columns=['Title'])
    return df.to_excel('output.xlsx')
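Putting the two functions together (reusing firstpage from the question; note that df.to_excel needs openpyxl installed):

firstpage = "https://www.ptt.cc/bbs/movie/index9002.html"
titles = content_get(firstpage)
to_excel(titles)  # writes output.xlsx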

Web scraping coinmarketcap.com with Python (requests & BeautifulSoup)

I want to build a list with coins from coinmarketcap.com.
Every element should be a tuple.
Something like:
coins = [('btc',8500,'+0.5%','+1.2%', '-1%'), ...]
I can't get the percentages:
The information is in td like this:
<td class="no-wrap percent-change text-right positive_change" data-timespan="1h" data-percentusd="0.99" data-symbol="BTC" data-sort="0.991515">0.99%</td>
How can I access the 0.99% value from above? What I actually need is data-percentusd from the td, but I don't know what that is.
My testing script is something like:
import requests
from bs4 import BeautifulSoup
url = 'https://coinmarketcap.com/all/views/all/'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')
name = soup.find_all('a', class_='currency-name-container')
price = soup.find_all('a', class_='price')
print(name)
print(price)
# how can I get the percentage change for 1h, 24h, 7d?
# delta_h = soup.find_all('td', ???)
You can loop over the rows of the table to get the data for each currency and store it in a tuple, and then add it to the list.
import requests
from bs4 import BeautifulSoup

r = requests.get('https://coinmarketcap.com/all/views/all/')
soup = BeautifulSoup(r.text, 'lxml')

data = []
table = soup.find('table', id='currencies-all')
for row in table.find_all('tr'):
    try:
        symbol = row.find('td', class_='text-left col-symbol').text
        price = row.find('a', class_='price').text
        time_1h = row.find('td', {'data-timespan': '1h'}).text
        time_24h = row.find('td', {'data-timespan': '24h'}).text
        time_7d = row.find('td', {'data-timespan': '7d'}).text
    except AttributeError:
        continue
    data.append((symbol, price, time_1h, time_24h, time_7d))

for item in data:
    print(item)
Partial Output:
('BTC', '$8805.46', '0.88%', '-12.30%', '-19.95%')
('ETH', '$677.45', '0.98%', '-11.54%', '-21.66%')
('XRP', '$0.780113', '0.62%', '-10.63%', '-14.42%')
('BCH', '$970.70', '1.01%', '-11.33%', '-23.89%')
('LTC', '$166.70', '0.74%', '-10.06%', '-19.56%')
('NEO', '$83.55', '0.24%', '-16.29%', '-33.39%')
('XLM', '$0.286741', '1.13%', '-13.23%', '-11.84%')
('ADA', '$0.200449', '0.63%', '-16.92%', '-31.43%')
('XMR', '$256.92', '0.63%', '-19.98%', '-19.46%')
Since some currencies in the table have missing data, calling .text on the result raises an AttributeError. To skip those currencies, I've used the try/except block.
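As a side note, when the data sits in a plain HTML <table> like this one did, pandas.read_html can often replace the manual row loop entirely. A sketch, assuming the page layout at the time of the question and that lxml is installed:

import pandas as pd

# read_html parses every <table> on the page into a DataFrame;
# attrs narrows it down to the table with id="currencies-all"
tables = pd.read_html('https://coinmarketcap.com/all/views/all/',
                      attrs={'id': 'currencies-all'})
df = tables[0]
print(df.head())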
