Beautiful soup looping through array of URLs - python

I'm trying to loop through an array of URLs and scrape board members from a list of companies. There seems to be a problem with my loop below: it only processes the first element in the array and duplicates the results. Any help with this would be appreciated. Code:
from bs4 import BeautifulSoup
import requests

# array of URLs to loop through; will be larger once I get the loop working correctly
tickers = ['http://www.reuters.com/finance/stocks/companyOfficers?symbol=AAPL.O', 'http://www.reuters.com/finance/stocks/companyOfficers?symbol=GOOG.O']
board_members = []
output = []
soup = BeautifulSoup(html, "html.parser")
for t in tickers:
    html = requests.get(t).text
    officer_table = soup.find('table', {"class": "dataTable"})
    for row in officer_table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 4:
            board_members.append((t, cols[0].text.strip(), cols[1].text.strip(),
                                  cols[2].text.strip(), cols[3].text.strip()))

for t, name, age, year_joined, position in board_members:
    output.append('{} {:35} {} {} {}'.format(t, name, age, year_joined, position))

soup = BeautifulSoup(html, "html.parser")
for t in tickers:
    html = requests.get(t).text
    officer_table = soup.find('table', {"class": "dataTable"})
You put soup outside the for loop; this causes an error, because html does not exist yet when you call BeautifulSoup(html, "html.parser"). (And if an html variable is left over from an earlier run, every iteration parses that same page, which is why the first company's results are duplicated.)
Just put it inside the loop, after html is assigned:
for t in tickers:
    html = requests.get(t).text
    soup = BeautifulSoup(html, "html.parser")
    officer_table = soup.find('table', {"class": "dataTable"})
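Putting the fix together, the whole script might look like this (a sketch; it assumes the Reuters pages still serve a table with class dataTable):
from bs4 import BeautifulSoup
import requests

tickers = ['http://www.reuters.com/finance/stocks/companyOfficers?symbol=AAPL.O',
           'http://www.reuters.com/finance/stocks/companyOfficers?symbol=GOOG.O']
board_members = []
for t in tickers:
    html = requests.get(t).text
    soup = BeautifulSoup(html, "html.parser")  # parse each page, not just the first
    officer_table = soup.find('table', {"class": "dataTable"})
    for row in officer_table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 4:
            # ticker URL plus name, age, year joined, position
            board_members.append((t, cols[0].text.strip(), cols[1].text.strip(),
                                  cols[2].text.strip(), cols[3].text.strip()))

for t, name, age, year_joined, position in board_members:
    print('{} {:35} {} {} {}'.format(t, name, age, year_joined, position))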

Related

Web Scraping with Python - blank return

I'm trying to scrape reviews from TrustPilot, but the code always returns a blank sheet containing only the headers/categories I specified. Could someone help me with this?
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()
names = []    # List to store the reviewer names
headers = []  # List to store the review titles
bodies = []   # List to store the review texts
ratings = []  # List to store the ratings
dates = []    # List to store the review dates
#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))
for a in soup.findAll('a', href=True, attrs={'class': 'reviews-container'}):
    name = a.find('div', attrs={'class': 'consumer-information_name'})
    header = a.find('div', attrs={'class': 'review-content_title'})
    body = a.find('div', attrs={'class': 'review-content_text'})
    rating = a.find('div', attrs={'class': 'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class': 'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)
print('webpage, no errors')
df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Date': dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print('csv made')
The issue is that soup.findAll('a', href=True, attrs={'class': 'reviews-container'}) finds no results, so there are 0 iterations in the loop. Make sure you are using the correct tags and class names. Also, you don't need a loop here, because BeautifulSoup's find_all method can collect each field directly. I used the requests module to open the web page, though it shouldn't make a difference.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
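From there, building the CSV the question was after is one more step (a sketch, assuming the class names above still match the live page and all five lists stay the same length):
import pandas as pd

df = pd.DataFrame({
    'User Name': [n.get_text(strip=True) for n in names],
    'Header': [h.get_text(strip=True) for h in headers],
    'Body': [b.get_text(strip=True) for b in bodies],
    'Rating': [r.get_text(strip=True) for r in ratings],
    'Date': [d.get_text(strip=True) for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')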

BeautifulSoup doesn't find tr tag with style

I am trying to get the following done:
Open a webpage and get the tr tag with bold font and then filter the links
This works great:
import urllib2
from bs4 import BeautifulSoup
response = urllib2.urlopen("https://wiki.guildwars.com/wiki/Weekly_activities")
data = response.read()
soup = BeautifulSoup(data, 'html.parser')
weekliesunsorted = soup.findAll('tr', style="font-weight: bold;")
pvebonus = weekliesunsorted[0].findAll('a')[0]
#and so on...
But unfortunately this code doesn't work: the fluxunsorted variable stays empty, even though the tr row does exist:
response = urllib2.urlopen("https://wiki.guildwars.com/wiki/Flux")
data = response.read()
soup = BeautifulSoup(data, 'html.parser')
fluxunsorted = soup.findAll('tr', style="font-weight:bold;") #empty?
flux = fluxunsorted[0].findAll('a')[0]
That's why I am getting the index out of range error.
Why is the list empty?
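One likely cause: keyword filters like style="..." are matched against the attribute value as an exact string, so "font-weight:bold;" and "font-weight: bold;" are two different values. A regex filter tolerates the whitespace difference (a sketch under that assumption):
import re
import urllib2
from bs4 import BeautifulSoup

response = urllib2.urlopen("https://wiki.guildwars.com/wiki/Flux")
soup = BeautifulSoup(response.read(), 'html.parser')
# match the style attribute with a regex instead of an exact string,
# so both "font-weight:bold;" and "font-weight: bold;" are found
fluxunsorted = soup.findAll('tr', style=re.compile(r'font-weight:\s*bold'))
if fluxunsorted:
    flux = fluxunsorted[0].findAll('a')[0]
    print(flux)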

Loop through multiple tags in Python BeautifulSoup

I am trying to loop through multiple tags in the HTML so I can print all the IDs.
My code right now prints only the first ID; how can I print the second, third, fourth, and so on?
soup = BeautifulSoup(r.content, "html.parser")
product_div = soup.find_all('div', {'class': 'valu '})
product_tag = product_div[0].find('a')
products = product_tag.attrs['val']
print products
This should help:
soup = BeautifulSoup(r.content, "html.parser")
for product_div in soup.find_all('div', {'class': 'size '}):
    product_tag = product_div.find('a')
    if product_tag:
        print product_tag.attrs['id']
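If you'd rather collect the values than print them, the same logic fits a list comprehension (a sketch, keeping the class and attribute names from the answer above):
product_divs = soup.find_all('div', {'class': 'size '})
ids = [div.find('a').attrs['id'] for div in product_divs if div.find('a')]
print(ids)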

Iterating over BeautifulSoup object

I am iterating over a table that I parsed from an html page. I want to iterate over the BeautifulSoup object, parse the text between the tags, and store it in a list. However, the code below keeps giving me only the very last text from the iteration. How do I accumulate the texts?
soup = BeautifulSoup(webpage, 'html.parser')
table = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").findAll("tr")
for row in table:
    key = []
    season = row.find_all("th")
    for year in season:
        y = year.get_text().encode('utf-8')
        key.append(y)
print key
Check this:
from bs4 import BeautifulSoup
import requests

url = "https://www.basketball-reference.com/awards/mvp.html"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
table = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").findAll("tr")
key = []
for row in table:
    season = row.findAll("th", {'class': 'left'})
    for year in season:
        y = year.get_text().encode('utf-8')
        key.append(y)
print key
The only mistake is that in your for loop you empty your list (key = []) on every iteration. I have modified your code a little, and it gives your desired output.
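Since print key and encode('utf-8') are Python 2 idioms, a Python 3 version of the same fix might look like this (a sketch):
from bs4 import BeautifulSoup
import requests

url = "https://www.basketball-reference.com/awards/mvp.html"
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
rows = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").find_all("tr")
key = []  # created once, before the loop, so it accumulates across rows
for row in rows:
    for year in row.find_all("th", {'class': 'left'}):
        key.append(year.get_text())  # str is already Unicode in Python 3
print(key)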

requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the html table and they print out as expected, but when I try to loop through them with requests.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)
    for link in links:
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = []
        # Find all the divs we need in one go.
        divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
        for div in divs:
            # find all the enclosing a tags.
            anchors = div.find_all('a')
            for anchor in anchors:
                # Now we have groups of 3 list items (li) tags
                lis = anchor.find_all('li')
                # we clean up the text from the group of 3 li tags and add them as a list to our table list.
                table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
Your mistake is the second for loop in the code:
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)
    for link in links:
ref['href'] gives you a single URL, but you use it as a list in the next for loop.
So effectively you have
for link in ref['href']:
and it gives you the first character of the URL http://properties.kimcore..., which is h.
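To see why, remember that iterating over a string yields its characters one at a time (example.com here stands in for the real URL):
links = 'http://example.com'
for link in links:
    print(link)  # prints 'h', then 't', 't', 'p', ...
    break        # requests.get('h') raises exactly the MissingSchema error above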
Full working code
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
    link = ref['href']
    print(link)
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
    # We have all the data so we add it to a DataFrame.
    headers = ['Number', 'Tenant', 'Square Footage']
    df = DataFrame(table, columns=headers)
    print (df)
BTW: if you add a comma, as in (ref['href'], ), then you get a tuple and the second for loop works correctly.
EDIT: it creates the list table_data at the start and adds all the data into this list, converting it into a DataFrame at the end.
But now I see it reads the same page a few times, because in every row the same URL appears in every column. You would have to take the URL from only one column.
EDIT: now it doesn't read the same URL many times.
EDIT: now it also gets the text and href from the first link in each row and appends them to every element added to the list.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table_data = []

# all rows in table except the first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
    # link in first column (td[0])
    #link = row.select('td')[0].find('a')
    link = row.find('a')
    link_href = link['href']
    link_text = link.text
    print('text:', link_text)
    print('href:', link_href)

    page = requests.get(link_href)
    soup = BeautifulSoup(page.content, 'html.parser')
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        anchors = div.find_all('a')
        for anchor in anchors:
            lis = anchor.find_all('li')
            item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
            item2 = lis[1].text
            item3 = lis[2].text.strip()
            table_data.append([item1, item2, item3, link_text, link_href])
    print('table_data size:', len(table_data))

headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)
