requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied - python

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the HTML table and they print out as expected. But when I try to loop through them (links) with requests.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table = soup.find('table')
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')

    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

    # We have all the data so we add it to a DataFrame.
    headers = ['Number', 'Tenant', 'Square Footage']
    df = DataFrame(table, columns=headers)
    print (df)

Your mistake is the second for loop in the code:
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)

for link in links:
ref['href'] gives you a single URL, but you use it as a list in the next for loop. So effectively you have

for link in ref['href']:

and iterating over a string gives you one character at a time, so the first "link" is the first character of the URL http://properties.kimcore..., which is h.
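A minimal sketch of the same problem in isolation (the URL below is just a stand-in for any ref['href']):

url = "http://example.com"   # stand-in for ref['href']

for link in url:
    print(link)   # h, t, t, p, ... -- so requests.get(link) raises MissingSchema
    break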
Full working code
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table = soup.find('table')
for ref in table.find_all('a', href=True):
    link = ref['href']
    print(link)

    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')

    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])

    # We have all the data so we add it to a DataFrame.
    headers = ['Number', 'Tenant', 'Square Footage']
    df = DataFrame(table, columns=headers)
    print(df)
BTW: if you put a comma inside the parentheses, (ref['href'], ), then you get a tuple, and the second for loop works correctly.
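A minimal sketch of that difference (the URL is again just a stand-in):

url = "http://example.com"   # stand-in for ref['href']

links = (url)    # parentheses alone do nothing -- this is still a plain string
links = (url,)   # the trailing comma makes a one-element tuple

for link in links:
    print(link)   # prints the whole URL once, instead of one character at a time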
EDIT: the code below creates the list table_data at the start, adds all the data to this list, and converts it into a DataFrame at the end.
But now I see it reads the same page a few times, because in every row the same URL appears in every column. You would have to get the URL from only one column.
EDIT: now it doesn't read the same URL many times.
EDIT: now it gets the text and href from the first link and adds them to every element in the list when it uses append().
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame

page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')

table_data = []

# all rows in table except first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
    # link in first column (td[0])
    #link = row.select('td')[0].find('a')
    link = row.find('a')

    link_href = link['href']
    link_text = link.text

    print('text:', link_text)
    print('href:', link_href)

    page = requests.get(link_href)
    soup = BeautifulSoup(page.content, 'html.parser')

    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        anchors = div.find_all('a')
        for anchor in anchors:
            lis = anchor.find_all('li')
            item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
            item2 = lis[1].text
            item3 = lis[2].text.strip()
            table_data.append([item1, item2, item3, link_text, link_href])

print('table_data size:', len(table_data))

headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)

Related

How to list out all the h2, h3, and p tags then create a dataframe to store them

I was given a website to scrape all of the key items from.
But the output I got is only for one item using BeautifulSoup4. So I wonder if I need to use something like soup.findall to extract all the key items into a list from the website.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
url=
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
column= soup.find(class_ = re.compile('columns is-multiline'))
print(column.prettify())
position = column.h2.text
company = column.h3.text
city_state= column.find_all('p')[-2].text
print (position, company, city_state)
Thank you.
Try this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

positions = [pos.text for pos in soup.find_all('h2')]
companies = [com.text for com in soup.find_all('h3')]

city_state0 = []
city_state1 = []
for p in soup.find_all('p', {'class': 'location'}):
    city_state0.append(p.text.split(',')[0].strip())
    city_state1.append(p.text.split(',')[1].strip())

df = pd.DataFrame({
    'city_state1': city_state0,
    'city_state2': city_state1,
    'companies': companies,
    'positions': positions
})
print(df)
You need to use find_all to get all the elements like so. find only gets the first element.
titles = soup.find_all('h2', class_='title is-5')
companies = soup.find_all('h3', class_='subtitle is-6 company')
locations = soup.find_all('p', class_='location')

# loop over locations and extract the city and state from each tag's text
for location in locations:
    city = location.text.split(', ')[0].strip()
    state = location.text.split(', ')[1].strip()
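If you then want the DataFrame the question title asks for, here is a minimal sketch, assuming the three lists above are the same length and every location text looks like "City, ST":

import pandas as pd

# Build one row per job posting from the parallel tag lists above.
rows = []
for title, company, location in zip(titles, companies, locations):
    city, state = [part.strip() for part in location.text.split(', ')]
    rows.append({
        'position': title.text.strip(),
        'company': company.text.strip(),
        'city': city,
        'state': state,
    })

df = pd.DataFrame(rows)
print(df.head())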

Beautifulsoup : Unable to extract href with several conditions

I'm trying to extract every link with BeautifulSoup from the SEC website, such as this one, by using the code from this Github. The thing is, I do not want to extract every 8-K, only the ones matching the items "2.02" within the "Description" column. So I edited the "Download.py" file and identified the following:
while continuation_tag:
    r = requests_get(browse_url, params=requests_params)
    if continuation_tag == 'first pass':
        logger.debug("EDGAR search URL: " + r.url)
        logger.info('-' * 100)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a', {'id': 'documentsbutton'}):
        URL = sec_website + link['href']
        linkList.append(URL)
    continuation_tag = soup.find('input', {'value': 'Next ' + str(count)})  # a button labelled 'Next 100' for example
    if continuation_tag:
        continuation_string = continuation_tag['onclick']
        browse_url = sec_website + re.findall('cgi-bin.*count=\d*', continuation_string)[0]
        requests_params = None
return linkList
I've tried to add another loop to match my regex but it doesn't work
for link in soup.find_all('a', {'id': 'documentsbutton'}):
    for link in soup.find_all(string=re.compile("items 2.02")):
        URL = sec_website + link['href']
        linkList.append(URL)
Any help would be really appreciated, thanks!
First find the tr that encapsulates both the a tag and the td tag that contains the items 2.02 text. Then find the url in the tr if the td actually contains the text items 2.02:
for link in soup.find_all("tr"):
td = link.find('td', {'class': 'small'})
if td:
if 'items 2.02' in td.text:
URL = sec_website + link.find('a', {'id': 'documentsbutton'})['href']
linkList.append(URL)
You can write something more concise by using css pseudo classes. The following looks for td child elements, of parent with class tableFile2, that have an adjacent sibling td (i.e. next column) which is both the third column (nth-of-type) of the table and contains 2.02; from those tds it filters to the child a tags that have id documentsbutton.
import requests
from bs4 import BeautifulSoup as bs # version 4.7.1 +
base = 'https://www.sec.gov'
r = requests.get('https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=8-K&dateb=&owner=exclude&start=0&count=40')
soup = bs(r.content, 'lxml') # or html.parser
links = [base + i['href'] for i in soup.select('.tableFile2 td:has(+ td:nth-of-type(3):contains("2.02")) #documentsbutton')]

Web Scraping with Python - blank return

I'm trying to scrape reviews from TrustPilot, but the code always return with blank sheets and the headers/categories I specified. Could someone help me with this?
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome()

names = []    # List to store the reviewer names
headers = []  # List to store the review titles
bodies = []
ratings = []  # List to store the review ratings
dates = []

#driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")

content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))

for a in soup.findAll('a', href=True, attrs={'class':'reviews-container'}):
    name = a.find('div', attrs={'class':'consumer-information_name'})
    header = a.find('div', attrs={'class':'review-content_title'})
    body = a.find('div', attrs={'class':'review-content_text'})
    rating = a.find('div', attrs={'class':'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class':'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)

print('webpage, no errors')

df = pd.DataFrame({'User Name':names,'Header':headers,'Body':bodies,'Rating':ratings,'Date':dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print('csv made')
The issue is soup.findAll('a', href=True, attrs={'class':'reviews-container'}) is not finding any results, so there are 0 iterations in the loop. Make sure you are using the correct tags and class names. Also you don't need to use a loop because BeautifulSoup has a find_all method. I used the requests module to open the web page, though it shouldn't make a difference.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
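From there, a minimal sketch of building the CSV the question was after, assuming the five lists stay the same length (each entry is a Tag, so take .text before storing it):

import pandas as pd

# Line the five columns up by position and write them out.
df = pd.DataFrame({
    'User Name': [n.text.strip() for n in names],
    'Header':    [h.text.strip() for h in headers],
    'Body':      [b.text.strip() for b in bodies],
    'Rating':    [r.text.strip() for r in ratings],
    'Date':      [d.text.strip() for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')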

Python append adding same data

I'm trying to extract the stock price and the market cap data from a Korean website.
Here is my code:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        if td.find('a'):
            company_name = td.find('a').text
            price_now = tds[2].text
            market_cap = tds[5].text
            data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
And this is the result I get. (Sorry for the Korean characters)
['삼성전자', '43,650', '100']
['', '43,650', '100']
['SK하이닉스', '69,800', '5,000']
['', '69,800', '5,000']
The second and the fourth line in the outcome should not be there. I just want the first and the third line. Where do lines two and four come from, and how do I get rid of them?
My dear friend, I think the problem is that you should check whether td.find('a').text has a value!
So I changed your code to this and it works!
import requests
from bs4 import BeautifulSoup

response = requests.get(
    'http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        # where the magic happens!
        if td.find('a') and td.find('a').text:
            company_name = td.find('a').text
            price_now = tds[2].text
            market_cap = tds[5].text
            data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
While I can't test it, it could be because there are two a tags on the page you're trying to scrape, while your for loop and if statement are set up to append information whenever they find an a tag. The first one has the name of the company, but the second one has no text, hence the blank output (because you do td.find('a').text, it tries to get the text of the target a tag).
For reference, this is the a tag you want:
<a href="...">삼성전자</a>
This is what you're picking up the second time around:
<img src="https://ssl.pstatic.net/imgstock/images5/ico_debatebl2.gif" width="15" height="13" alt="토론실">
Perhaps you can change your if statement to make sure the class of the a tag is title or something to make sure that you only enter the if statement when you're looking at the a tag with the company name in it.
I'm at work so I can't really test anything, but let me know if you have any questions later!
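A rough sketch of that idea, keeping the rest of the original loop; the class name 'tltle' is an assumption about what the company-name links on this Naver page use, so verify it in the page source first:

for tr in table.find_all('tr'):
    tds = list(tr.find_all('td'))
    for td in tds:
        a_tag = td.find('a')
        # keep only the anchor that carries the company name, not the
        # discussion-board icon link; 'tltle' is an assumed class name
        if a_tag and 'tltle' in (a_tag.get('class') or []):
            data.append([a_tag.text, tds[2].text, tds[5].text])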
Check the length of tds: it should be equal to 13, and then there is no need for multiple for loops.
import requests
from bs4 import BeautifulSoup

response = requests.get('http://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page=1')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {'class': 'type_2'})

data = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) == 13:
        company_name = tds[1].text
        price_now = tds[2].text
        market_cap = tds[6].text
        data.append([company_name, price_now, market_cap])

print(*data, sep="\n")
result
['삼성전자', '43,650', '2,802,035']
['SK하이닉스', '69,800', '508,146']
['삼성전자우', '35,850', '323,951']
['셀트리온', '229,000', '287,295']
['LG화학', '345,500', '243,897']

Iterating over BeautifulSoup object

I am iterating over a table that I parsed from an HTML page. I want to iterate over the BeautifulSoup object, parse the text between the tags, and store it in a list. However, the code below keeps giving me only the very last text from the iteration. How do I accumulate the texts instead?
soup = BeautifulSoup(webpage, 'html.parser')
table = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").findAll("tr")

for row in table:
    key = []
    season = row.find_all("th")
    for year in season:
        y = year.get_text().encode('utf-8')
        key.append(y)

print key
Check this:
from bs4 import BeautifulSoup
import requests

url = "https://www.basketball-reference.com/awards/mvp.html"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
table = soup.find("table", attrs={"id": "mvp_NBA"}).find("tbody").findAll("tr")

key = []
for row in table:
    season = row.findAll("th", {'class': 'left'})
    for year in season:
        y = year.get_text().encode('utf-8')
        key.append(y)

print key
The only mistake you are making is that in your for loop, on every iteration, you empty your list with key = []. I have modified your code a little bit and it gives your desired output.
