I can't get information with bs4 - Python

I'm trying to scrape the title, contact information (phone), and webpage from this URL:
https://partnerportal.fortinet.com/directory/search?l=Spain&p=1
The &p=1 parameter selects the page number; there are 92 pages in total.
This is my code. I don't get anything in the print output.
import datetime
import requests
from bs4 import BeautifulSoup
import csv

filename = "fichero" + datetime.datetime.now().strftime("%d-%m-%Y") + ".csv"

with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Nombre Empresa", "Direccion Empresa", "Telefono Empresa"])

for i in range(1, 3):
    r = requests.get('https://partnerportal.fortinet.com/directory/search?l=Spain&p=' + format(i))
    soup = BeautifulSoup(r.text, "html.parser")
    array_title = soup.select('div.panel panel-default div.col-sm-10 h3')
    array_address = soup.select('div.panel panel-default p.locator-partner-info')
    array_webpage = soup.find_all('a', class_='locator-parter-site', text=True)
    for iterator in range(0, len(array_title)):
        title = array_title[iterator].text.strip()
    for iterator2 in range(0, len(array_address)):
        address = array_address[iterator2].text.strip()
        print(title)
        print(address)

Instead of this:
array_title = soup.select('div.panel panel-default div.col-sm-10 h3')
array_address = soup.select('div.panel panel-default p.locator-partner-info')
try this:
array_title = soup.select('div.panel-default div.col-sm-10 h3')
array_address = soup.select('div.panel-default p.locator-partner-info')
(In a CSS selector, the space in div.panel panel-default means "a <panel-default> element inside div.panel", which matches nothing.) Also, you are printing title inside the address loop, so it only ever shows the last value assigned to it (i.e., "Tiws"). Print inside the title loop to see the correct result.
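Putting both fixes together, a minimal sketch of the corrected loop (reusing the imports from the question; this assumes the directory pages are still plain server-rendered HTML):
for i in range(1, 3):
    r = requests.get('https://partnerportal.fortinet.com/directory/search?l=Spain&p=' + str(i))
    soup = BeautifulSoup(r.text, "html.parser")
    titles = soup.select('div.panel-default div.col-sm-10 h3')
    addresses = soup.select('div.panel-default p.locator-partner-info')
    # zip pairs each title with its matching address instead of two index loops
    for title, address in zip(titles, addresses):
        print(title.text.strip())
        print(address.text.strip())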

To get the data from the website:
import requests
import bs4

url = "https://partnerportal.fortinet.com/directory/search?l=Spain&p=1"
html = bs4.BeautifulSoup(requests.get(url).text, 'lxml')
rows = html.find('div', {'class': 'row row-results'})
results = [{
    'id': row.find('div', {'class': 'col-sm-10'}).find('h3').getText(),
    'phone': row.find('div', {'class': 'partner-info-box'}).find('p').getText().split('Phone: ')[1].split('\n')[0],
    'url': row.find('div', {'class': 'partner-info-box'}).find('a').get('href'),
} for row in rows.find_all('div', {'class': 'col-sm-12'})]
To save the resulting list of dicts to a .csv file:
import pandas as pd
import datetime
filename = "fichero" + datetime.datetime.now().strftime("%d-%m-%Y")+".csv"
pd.DataFrame.from_dict(results).to_csv(filename, index=False)
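If pandas is not available, the standard library's csv.DictWriter can do the same job; a sketch assuming results and filename from above:
import csv

with open(filename, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "phone", "url"])
    writer.writeheader()  # header row: id,phone,url
    writer.writerows(results)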

Related

Want to scrape each category individually, but the data comes out either one character at a time or as one paragraph

I want to extract Name & Position, Education, Contact number, and Email into separate columns of a CSV, but what I get is either a single cell per character or a single column per paragraph (if I convert it to a list). Here is the code:
import requests
from bs4 import BeautifulSoup
from csv import writer

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
page = soup.find_all('p')

for i in page:
    i = i.text
    with open('page.csv', 'a', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        thewriter.writerow(i)
You can use a regex to pull out what you need. (As an aside, writerow(i) with a string writes one cell per character, because csv treats the string as a sequence of fields; pass it a list of fields instead.)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
content = soup.find('div', {'id': 'divContent'})
p_list = content.find_all('p')

rows = []
for p in p_list:
    string = p.text
    text = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-Z0-9].*@\w.*\.\w.*)', string).groups()
    name = text[0]
    edu = text[2]
    phone = text[4]
    email = text[5]
    row = {
        'name': name,
        'education': edu,
        'phone': phone,
        'email': email}
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv('page.csv', index=False)
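Note that re.search() returns None for any paragraph that does not match the pattern, and calling .groups() on None raises an AttributeError. A guarded sketch of the same loop (assuming p_list from above):
pattern = re.compile(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-Z0-9].*@\w.*\.\w.*)')

rows = []
for p in p_list:
    match = pattern.search(p.text)
    if match is None:
        continue  # skip paragraphs that don't follow the bio format
    name, _, edu, _, phone, email = match.groups()
    rows.append({'name': name, 'education': edu, 'phone': phone, 'email': email})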

Web scraping with BS4: 'NoneType' object has no attribute 'find'

I'm not sure why my code isn't working. I get AttributeError: 'NoneType' object has no attribute 'find'
My code is as follows:
import requests
from bs4 import BeautifulSoup
import csv

root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
html = requests.get(root_url)
soup = BeautifulSoup(html.text, 'html.parser')
paging = soup.find("nav", {"aria-label": "pagination-heading-3"}).find("li", {"class": "page-item"}).find_all("a")
start_page = paging[1].text
last_page = paging[len(paging) - 2].text

outfile = open('congregationlookup.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])

pages = list(range(1, int(last_page) + 1))

for page in pages:
    url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' % (page)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    # print(soup.prettify())
    print('Processing page: %s' % (page))
    name_list = soup.findAll("div", {"class": "views-field views-field-congregation"})
    for element in name_list:
        name = element.find('h3').text
        address = element.find('field-content mb-2').text.strip()
        phone = element.find("i", {"class": "fa fa-phone mr-1"}).text.strip()
        writer.writerow([name, address, phone])

outfile.close()
print('Done')
print ('Done')
I'm trying to scrape the name, address, and phone number from the URJ Congregations website.
Thank you
Final code
import csv
import requests
from bs4 import BeautifulSoup

# root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
# html = requests.get(root_url)
# soup = BeautifulSoup(html.text, 'html.parser')
# paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
# start_page = paging[1].text
# last_page = paging[len(paging) - 3].text

outfile = open('congregationlookup.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])

pages = list(range(1, 1000))

for page in pages:
    url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' % (page)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    # print(soup.prettify())
    print('Processing page: %s' % (page))
    elements = soup.find_all("div", {"class": "views-row"})
    if len(elements) == 0:
        break
    for element in elements:
        name = element.find("div", {"class": "views-field views-field-congregation"}).text.strip()
        address = element.find("div", {"class": "views-field views-field-country"}).text.strip()
        phone = element.find("div", {"class": "views-field views-field-website"}).text.strip().split("\n")[0]
        writer.writerow([name, address, phone])

outfile.close()
print('Done')
Most likely, one of your chained find() calls returned None because nothing on the page matched (the aria-label you search for, for example), so the next .find() is invoked on None rather than a Tag, hence your error.
Also, as an FYI, findAll() is the old BeautifulSoup 3 spelling; in bs4 you should use find_all(). See: Difference between "findAll" and "find_all" in BeautifulSoup
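A defensive sketch of that first lookup (my pattern, not from the original answers): check each result before chaining off it.
nav = soup.find("nav", {"aria-label": "pagination-heading--3"})
if nav is None:
    raise SystemExit("pagination nav not found - did the markup change?")
pagination = nav.find("ul", {"class": "pagination"})
if pagination is None:
    raise SystemExit("pagination list not found")
paging = pagination.find_all("a")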
There are a load of problems.
The first problem is
"pagination-heading--3"
instead of
"pagination-heading-3"
Next, I changed
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
to
paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
This is the line where I swapped in the first problematic string. I also changed the second search to find the ul: you were finding a single li and searching inside it, which would have produced an empty list.
Next,
last_page = paging[len(paging) - 3].text
since you are trying to get the third element from the end.
It still doesn't work; I will keep updating.

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

course_urls = []
for h in soup.findAll('h3'):
    a = h.find('a')
    try:
        if 'href' in a.attrs:
            card_url = urljoin(base_url, a.get('href'))
            course_urls.append(card_url)
    except:
        pass

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

date = []
course = []
time = []
runner = []
tips = []
tipsters = []

runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
    tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
    tipsters.append(tipster_names)

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])

for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")
        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
            tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
            row = [runner_name, tips_no, tipster_names]
            df.loc[len(df)] = row  # append the new row

df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,
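One aside on the combined version (my note, not from the original answer): growing a DataFrame with df.loc[len(df)] re-allocates on every append, so for larger scrapes it is usually cheaper to collect plain rows and build the frame once:
import pandas as pd

rows = []
for runner_name, tips_no, tipster_names in [("Ajrad", "2", "NEWMARKET")]:  # stand-in for the scraping loops
    rows.append([runner_name, tips_no, tipster_names])

# build the DataFrame once at the end instead of row-by-row
df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])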

Multiple Pages Web Scraping with Python and Beautiful Soup

I'm trying to write code to scrape some data from pages about hotels. The final information (name of the hotel and address) should be exported to CSV. The code works, but only on one page...
import requests
import pandas as pd
from bs4 import BeautifulSoup  # HTML data structure

page_url = requests.get('https://e-turysta.pl/noclegi-krakow/')
soup = BeautifulSoup(page_url.content, 'html.parser')

list = soup.find(id='nav-lista-obiektow')
items = list.find_all(class_='et-list__details flex-grow-1 d-flex d-md-block flex-column')

nazwa_noclegu = [item.find(class_='h3 et-list__details__name').get_text() for item in items]
adres_noclegu = [item.find(class_='et-list__city').get_text() for item in items]

dane = pd.DataFrame(
    {
        'nazwa': nazwa_noclegu,
        'adres': adres_noclegu
    }
)

print(dane)
dane.to_csv('noclegi.csv')
I tried a loop, but it doesn't work:
for i in range(22):
    url = requests.get('https://e-turysta.pl/noclegi-krakow/'.format(i+1)).text
    soup = BeautifulSoup(url, 'html.parser')
Any ideas?
The URLs are different from the one you use - you forgot ?page=.
And you have to use {} to insert the value into the string:
url = 'https://e-turysta.pl/noclegi-krakow/?page={}'.format(i+1)
or concatenate it
url = 'https://e-turysta.pl/noclegi-krakow/?page=' + str(i+1)
or use f-string
url = f'https://e-turysta.pl/noclegi-krakow/?page={i+1}'
EDIT: working code
import requests
from bs4 import BeautifulSoup  # HTML data structure
import pandas as pd

def get_page_data(number):
    print('number:', number)
    url = 'https://e-turysta.pl/noclegi-krakow/?page={}'.format(number)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    container = soup.find(id='nav-lista-obiektow')
    items = container.find_all(class_='et-list__details flex-grow-1 d-flex d-md-block flex-column')

    # better to group them - so you could add a default value if there is no nazwa or adres
    dane = []
    for item in items:
        nazwa = item.find(class_='h3 et-list__details__name').get_text(strip=True)
        adres = item.find(class_='et-list__city').get_text(strip=True)
        dane.append([nazwa, adres])

    return dane

# --- main ---

wszystkie_dane = []
for number in range(1, 23):
    dane_na_stronie = get_page_data(number)
    wszystkie_dane.extend(dane_na_stronie)

dane = pd.DataFrame(wszystkie_dane, columns=['nazwa', 'adres'])
dane.to_csv('noclegi.csv', index=False)
In your loop you call the .format() function, but you need to insert the {} placeholder into the string you are formatting:
for i in range(22):
    url = requests.get('https://e-turysta.pl/noclegi-krakow/{}'.format(i+1)).text
    soup = BeautifulSoup(url, 'html.parser')

Save data as new lines but in a single cell (lxml, Python)

I want the data like this...
"Basic jersey
Does what it says on the tin
Main: 100% Cotton."
in a single cell, but I'm getting the data like this...
"Basic jerseyDoes what it says on the tinMain: 100% Cotton."
THIS IS THE HTML
<div class="about-me">
    <h4>ABOUT ME</h4>
    <span><div>Basic jersey</div><div>Does what it says on the tin</div><br>Main: 100% Cotton.</span>
</div>
THIS IS MY CODE
from selenium import webdriver
from lxml import html
import pandas as pd
import collections, os
from bs4 import BeautifulSoup

def Save_to_Csv(data):
    filename = 'data.csv'
    df = pd.DataFrame(data)
    df.set_index('Title', drop=True, inplace=True)
    if os.path.isfile(filename):
        with open(filename, 'a') as f:
            df.to_csv(f, mode='a', sep=",", header=False, encoding='utf-8')
    else:
        df.to_csv(filename, sep=",", encoding='utf-8')

with open('urls.txt', 'r') as f:
    links = [link.strip() for link in f.readlines()]

driver = webdriver.Chrome()

for urls in links:
    global image
    driver.get(urls)
    source = driver.page_source
    tree = html.fromstring(source)
    data = BeautifulSoup(source, 'html.parser')
    imgtag = data.find_all('li', attrs={'class': 'image-thumbnail'})

    image = []
    for imgsrc in imgtag:
        image.append(imgsrc.img['src'].replace('?$S$&wid=40&fit=constrain', '?$XXL$&wid=513&fit=constrain'))

    title = tree.xpath('string(.//div/h1)')
    price = tree.xpath('string(.//span[@class="current-price"])')
    sku = tree.xpath('string(.//div[@class="product-code"]/span)')
    aboutme = tree.xpath('string(.//div[@class="about-me"]/span)')

    foundings = collections.OrderedDict()
    foundings['Title'] = [title]
    foundings['Price'] = [price]
    foundings['Product_Code'] = [sku]
    foundings['About_Me'] = [aboutme]
    foundings['Image'] = [image]

    Save_to_Csv(foundings)
    print(title, price, sku, aboutme, image)

driver.close()
Using the HTML you have given, you can solve this using the stripped_strings generator as follows:
from bs4 import BeautifulSoup
html = """
<div class="about-me">
<h4>ABOUT ME</h4>
<span><div>Basic jersey</div><div>Does what it says on the tin</div><br>Main: 100% Cotton.</span>
</div>"""
soup = BeautifulSoup(html, "html.parser")
print('\n'.join(soup.span.stripped_strings))
This collects each text fragment in a stripped list and then joins them together with newlines:
Basic jersey
Does what it says on the tin
Main: 100% Cotton.
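To plug this into the scraper from the question, a sketch (assuming data is the BeautifulSoup object already built from driver.page_source):
about_div = data.find('div', class_='about-me')
# join the fragments with newlines; fall back to '' if the block is missing
aboutme = '\n'.join(about_div.span.stripped_strings) if about_div and about_div.span else ''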
