What I'm trying to do:
Take multiple URLs.
Take the h2 text from every URL.
Merge the h2 texts and then write them to a CSV file.
In this code, I only managed to:
Take one URL and get the h2 text from that URL.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "https://example.com/ekonomi/20200108/"
# i am trying to do | urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# finds each product from the store page
containers = page_soup.findAll("div", {"class": "b-plainlist__info"})

out_filename = "output.csv"
headers = "title \n"

f = open(out_filename, "w")
f.write(headers)

for container in containers:
    title = container.h2.get_text()
    f.write(title.replace(",", " ") + "\n")

f.close()  # Close the file
Provided your iteration through the containers is correct, this should work:
You want to iterate through the URLs. For each URL, grab the titles and append them to a list. Then build a DataFrame from that list and write it to CSV with pandas:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd

urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']

titles = []
for page_url in urls:
    uClient = uReq(page_url)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()

    # finds each product from the store page
    containers = page_soup.findAll("div", {"class": "b-plainlist__info"})

    for container in containers:
        titles.append(container.h2.get_text())

df = pd.DataFrame(titles, columns=['title'])
df.to_csv("output.csv", index=False)
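If you would rather not depend on pandas, here is a minimal sketch of the same idea with the standard csv module (assuming the same urls list and the same b-plainlist__info page structure):

import csv
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/']  # same list as above

with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title"])
    for page_url in urls:
        uClient = uReq(page_url)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        for container in page_soup.findAll("div", {"class": "b-plainlist__info"}):
            # container.h2 can be None if a block has no heading, so guard before writing
            if container.h2:
                writer.writerow([container.h2.get_text()])

csv.writer also takes care of quoting, so the replace(",", " ") workaround from the original code is not needed.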
Related
Hey, how can I change this code to enter each page and get the info I want from this URL (the book name and the URL of the book)?
I wrote this code (with Google's help), but I want to get all the books from all the pages (50 pages).
# import web grabbing client and
# HTML parser
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import requests

# variable to store website link as string
booksURL = 'http://books.toscrape.com/'

# grab website and store in variable urlClient
urlClient = uReq(booksURL)

# read and close HTML
page_html = urlClient.read()
urlClient.close()

# call BeautifulSoup for parsing
page_soup = soup(page_html, "html.parser")

# grabs all the products under list tag
bookshelf = page_soup.findAll(
    "li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

for books in bookshelf:
    # collect title of all books
    book_title = books.h3.a["title"]
    book_url = books.find("a")["href"]
    # books_url = books.h3.a["url"]
    print(book_title + " - " + booksURL + book_url)
I tried to add this code, but I don't know how to fit it into my script:
for i in range(51):  # Number of pages plus one
    url = "https://books.toscrape.com/catalogue/page-{}.html".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
This might work. I have removed uReq because I prefer using requests ;)
# import web grabbing client and
# HTML parser
from bs4 import BeautifulSoup as soup
import requests

for i in range(1, 51):  # Number of pages plus one
    url = "https://books.toscrape.com/catalogue/page-{}.html".format(i)
    response = requests.get(url)

    # call BeautifulSoup for parsing
    page_soup = soup(response.content, "html.parser")

    # grabs all the products under list tag
    bookshelf = page_soup.findAll(
        "li", {"class": "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

    for books in bookshelf:
        # collect title of all books
        book_title = books.h3.a["title"]
        book_url = books.find("a")["href"]
        print(book_title + " - " + book_url)
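Note that the href values on the catalogue pages are relative, so this prints paths rather than full links. If you want absolute URLs like in your original print statement, here is a small sketch using urllib.parse.urljoin (the href value below is hypothetical, standing in for books.find("a")["href"]):

from urllib.parse import urljoin

page_url = "https://books.toscrape.com/catalogue/page-1.html"
relative_href = "some-book_123/index.html"  # hypothetical value of books.find("a")["href"]

# urljoin resolves the relative path against the page it was found on
print(urljoin(page_url, relative_href))
# https://books.toscrape.com/catalogue/some-book_123/index.html

Inside the loop above you would call urljoin(url, book_url) before printing.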
I'm having trouble scraping multiple URLs. Essentially I'm able to run this for only one genre, but the second I include other links it stops working.
The goal is to get the data and place it into a csv file with the movie title, url, and genre. Any help would be appreciated!
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("li", {"class": "nm-content-horizontal-row-item"})

# name the output file to write to local disk
out_filename = "netflixaction2.csv"
# header of csv file to be written
headers = "Movie_Name, Movie_ID \n"

# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

for container in containers:
    title_container = container.findAll("a", {"class": "nm-collections-title nm-collections-link"})
    title_container = title_container[0].text
    movieid = container.findAll("a", {"class": "nm-collections-title nm-collections-link"})
    movieid = movieid[0].attrs['href']
    print("Movie Name: " + title_container, "\n")
    print("Movie ID: ", movieid, "\n")
    f.write(title_container + ", " + movieid + "\n")

f.close()  # Close the file
The reason you are getting the error is that you are trying to do a GET request on a list:
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']
uClient = uReq(my_url)
What I suggest here is to loop through each link instead:
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

for link in my_url:
    uClient = uReq(link)
    page_html = uClient.read()
    ....
Also note that if you simply wrap your existing code in that loop, reopening the file on every iteration will overwrite what f.write has already written. What you need to do is something like:
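For example, a minimal sketch of that structure, keeping your original uReq and f.write approach (open the file and write the header once, outside the loop):

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

# open the file and write the header once, before looping over the links
f = open("netflixaction2.csv", "w")
f.write("Movie_Name, Movie_ID \n")

for link in my_url:
    uClient = uReq(link)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()

    for container in page_soup.findAll("li", {"class": "nm-content-horizontal-row-item"}):
        anchor = container.findAll("a", {"class": "nm-collections-title nm-collections-link"})
        f.write(anchor[0].text + ", " + anchor[0].attrs['href'] + "\n")

f.close()  # close once, after all pages are written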
New edit:
import csv
import requests
from bs4 import BeautifulSoup as soup

# All given URLs
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

# Create and open the CSV file (the with statement closes it automatically)
with open("netflixaction2.csv", 'w', encoding='utf-8') as csv_file:
    # Headers for CSV
    headers_for_csv = ['Movie Name', 'Movie Link']

    # DictWriter for writing the CSV rows
    csv_writer = csv.DictWriter(csv_file, delimiter=',', lineterminator='\n', fieldnames=headers_for_csv)
    csv_writer.writeheader()

    # We need to loop through each URL from the list
    for link in my_url:
        # Do a simple GET request with the URL
        response = requests.get(link)
        page_soup = soup(response.text, "html.parser")

        # Find all nm-content-horizontal-row-item elements
        containers = page_soup.findAll("li", {"class": "nm-content-horizontal-row-item"})

        # Loop through each found "li"
        for container in containers:
            movie_name = container.text.strip()
            movie_link = container.find("a")['href']
            print(f"Movie Name: {movie_name} | Movie link: {movie_link}")

            # Write to CSV
            csv_writer.writerow({
                'Movie Name': movie_name,
                'Movie Link': movie_link,
            })
That should be your solution :) Feel free to comment if I'm missing something!
I've been working on this web scraper for a while and trying to get the body content of different links of an online newsletter. Therefore, if I breakdown the code for the second loop and run it separately, it will return the correct results, however, if the same part is put inside a loop in the bigger script, it will return the error "IndexError: list index out of range".
This is the script that 2nd loop returns the error (UPDATED):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

def pages(myurl):
    # opening up connection, grabbing the page
    uClient = uReq(myurl)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    dt = []
    ttl = []
    name = []
    body = []
    source = []

    # grabs each newsletter subject
    titular = page_soup.findAll("div", {"class": "col-md-9 col-sm-9 col-xs-9"})

    fixed = 'https://www.df.cl/noticias/site/tax/port/all'

    for tit1 in titular:
        date = tit1.span.text
        dt.append(date)
        title = tit1.h2.a.text
        ttl.append(title)
        link = tit1.h2.a["href"].strip()
        source.append(fixed + link)

    df = pd.DataFrame(dt, columns=['date'])
    df['title'] = ttl
    df['link'] = source

    for link in df['link']:
        new_link = fixed + link
        page = uReq(new_link)
        page_html_1 = page.read()
        page.close()
        page_soup = soup(page_html_1, "html.parser")
        content = page_soup.findAll("div", {"class": "entry cuerpo-noticias CUERPO"})
        cont1 = content[0].text
        body.append(cont1)

    df['content'] = body
    print(df)
    #df.to_csv(u'C:/Users/snpw9/Desktop/Scripts/sample_scrap.csv', mode='a', header=False, index=False)

pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_230__1.html')  # Banca y Fintech
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_20__1.html')   # Bolsa y Monedas
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_226__1.html')  # Pensiones
pages('https://www.df.cl/noticias/site/tax/port/all/taxport_3_228__1.html')  # Seguros
It would be very helpful to make this part work, hopefully, with your help!
The second script, without the loop (which works properly):
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myurl = 'https://www.df.cl/noticias/site/tax/port/all/noticias/mercados/banca-fintech/bancoestado-destina-90-millones-para-los-gastos-de-internet-de-sus/2020-07-07/152240.html'

#def pages(myurl):

# opening up connection, grabbing the page
uClient = uReq(myurl)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each newsletter subject
content = page_soup.findAll("div", {"class": "entry cuerpo-noticias CUERPO"})
cont1 = content[0].text
print(cont1)
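One way to narrow down an "IndexError: list index out of range" in the looped version is to guard the content lookup and print the URL that came back empty, instead of indexing content[0] directly. A minimal sketch of such a guard, assuming the same findAll call as in the second loop (the function name is just for illustration):

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

def article_body(url):
    # fetch one article and return its body text, or None if the expected div is missing
    page = uReq(url)
    page_soup = soup(page.read(), "html.parser")
    page.close()

    content = page_soup.findAll("div", {"class": "entry cuerpo-noticias CUERPO"})
    if not content:
        # printing the failing URL shows whether the links being built are the ones you expect
        print("no body found for:", url)
        return None
    return content[0].text

Calling this for each value in df['link'] keeps the loop running and lists the URLs that fail.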
How can I get more data from more than one page into my CSV file?
from bs4 import BeautifulSoup
import requests
import csv

source = requests.get('https://software-overzicht.nl/amersfoort?page=1', 'https://software-overzicht.nl/amersfoort?page=2').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('cms_scrape.csv', 'w')
csv_writter = csv.writer(csv_file)
csv_writter.writerow(['naambedrijf', 'adress'])

for search in soup.find_all('div', class_='company-info-top'):
    title = search.a.text
    adress = search.p.text
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        print(title)
        csv_writter.writerow([title, adress])

csv_file.close()
You just need to move your requests.get() call, and the parsing that follows it, into your loop over the page range:
from bs4 import BeautifulSoup
import requests
import csv

with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writter = csv.writer(f)
    csv_writter.writerow(['naambedrijf', 'adress'])

    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')

        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            print(title)
            csv_writter.writerow([title, adress])
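If some of the company-info-top blocks are missing the name link or the address paragraph, search.a or search.p will be None and the .text access will raise an AttributeError. A small sketch of guarding for that, as a drop-in variant of the inner loop above (same soup and csv_writter names):

for search in soup.find_all('div', class_='company-info-top'):
    # skip blocks that lack the expected name link or address paragraph
    if search.a is None or search.p is None:
        continue
    title = search.a.text.strip()
    adress = search.p.text.strip()
    csv_writter.writerow([title, adress])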
I am trying to loop through multiple pages and my code doesn't extract anything. I am kind of new to scraping so bear with me. I made a container so I can target each listing. I also made a variable to target the anchor tag that you would press to go to the next page. I would really appreciate any help I could get. Thanks.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

for page in range(0, 25):
    file = "breakfeast_chicago.csv"
    f = open(file, "w")
    Headers = "Nambusiness_name, business_address, business_city, business_region, business_phone_number\n"
    f.write(Headers)

    my_url = 'https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}'.format(page)
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # grabs each listing
    containers = page_soup.findAll("div", {"class": "result"})
    new = page_soup.findAll("a", {"class": "next ajax-page"})

    for i in new:
        try:
            for container in containers:
                b_name = i.find("container.h2.span.text").get_text()
                b_addr = i.find("container.p.span.text").get_text()
                city_container = container.findAll("span", {"class": "locality"})
                b_city = i.find("city_container[0].text ").get_text()
                region_container = container.findAll("span", {"itemprop": "postalCode"})
                b_reg = i.find("region_container[0].text").get_text()
                phone_container = container.findAll("div", {"itemprop": "telephone"})
                b_phone = i.find("phone_container[0].text").get_text()
                print(b_name, b_addr, b_city, b_reg, b_phone)
                f.write(b_name + "," + b_addr + "," + b_city.replace(",", "|") + "," + b_reg + "," + b_phone + "\n")
        except AttributeError:
            pass

    f.close()
If you're using BS4, try find_all.
Try dropping into a trace with import pdb; pdb.set_trace() and debug what is actually being selected in the for loop.
Also, some content may be hidden if it is loaded via JavaScript.
Each anchor tag or href you would "click" is just another network request, and if you plan to follow those links, consider slowing down the rate of requests so you don't get blocked.
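For the pacing point, a minimal sketch of spacing out the page requests with time.sleep (the URL pattern is the one from the question; the two-second delay is just an example value):

import time
from urllib.request import urlopen as uReq

base_url = "https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}"

for page in range(1, 6):
    uClient = uReq(base_url.format(page))
    page_html = uClient.read()
    uClient.close()

    # wait a couple of seconds so the requests are not fired back-to-back
    time.sleep(2)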
You can try the script below. It will traverse the different pages through pagination and collect the name and phone number from each container.
import requests
from bs4 import BeautifulSoup

my_url = "https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}"

for link in [my_url.format(page) for page in range(1, 5)]:
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")

    for item in soup.select(".info"):
        try:
            name = item.select(".business-name [itemprop='name']")[0].text
        except Exception:
            name = ""
        try:
            phone = item.select("[itemprop='telephone']")[0].text
        except Exception:
            phone = ""
        print(name, phone)
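The script above only prints; since the question writes to a CSV file, here is a minimal sketch of collecting the same fields with csv.writer (the file name is taken from the question, the selectors from the answer above):

import csv
import requests
from bs4 import BeautifulSoup

my_url = "https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}"

with open("breakfeast_chicago.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["business_name", "business_phone_number"])

    for link in [my_url.format(page) for page in range(1, 5)]:
        soup = BeautifulSoup(requests.get(link).text, "lxml")
        for item in soup.select(".info"):
            # fall back to empty strings when a listing is missing a field
            name = item.select(".business-name [itemprop='name']")
            phone = item.select("[itemprop='telephone']")
            writer.writerow([name[0].text if name else "", phone[0].text if phone else ""])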