BeautifulSoup error when saving output to a .txt file - Python

from bs4 import BeautifulSoup
import requests
import os
url = "http://nos.nl/artikel/2093082-steeds-meer-nekklachten-bij-kinderen-door-gebruik-tablets.html"
r = requests.get(url)
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'))
data = soup.find_all("article", {"class": "article"})
with open("data1.txt", "wb") as file:
content=‘utf-8’
for item in data:
content+='''{}\n{}\n\n{}\n{}'''.format( item.contents[0].find_all("time", {"datetime": "2016-03-16T09:50:30+0100"})[0].text,
item.contents[0].find_all("a", {"class": "link-grey"})[0].text,
item.contents[0].find_all("img", {"class": "media-full"})[0],
item.contents[1].find_all("div", {"class": "article_textwrap"})[0].text,
)
with open("data1.txt".format(file_name), "wb") as file:
file.write(content)
I recently solved a UTF/Unicode problem, but now the script isn't saving the data to a .txt file at all. What do I need to do?

If you want to write the data to the file as UTF-8, try codecs.open, like:
from bs4 import BeautifulSoup
import requests
import os
import codecs
url = "http://nos.nl/artikel/2093082-steeds-meer-nekklachten-bij-kinderen-door-gebruik-tablets.html"
r = requests.get(url)
soup = BeautifulSoup(r.content)
data = soup.find_all("article", {"class": "article"})
with codecs.open("data1.txt", "wb", "utf-8") as filen:
for item in data:
filen.write(item.contents[0].find_all("time", {"datetime": "2016-03-16T09:50:30+0100"})[0].get_text())
filen.write('\n')
filen.write(item.contents[0].find_all("a", {"class": "link-grey"})[0].get_text())
filen.write('\n\n')
filen.write(item.contents[0].find_all("img", {"class": "media-full"})[0].get_text())
filen.write('\n')
filen.write(item.contents[1].find_all("div", {"class": "article_textwrap"})[0].get_text())
I'm unsure about filen.write(item.contents[0].find_all("img", {"class": "media-full"})[0]) because that returned a Tag instance for me.
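If you do want something usable from that img tag, a couple of options (a sketch, assuming the element is a normal bs4 Tag with a src attribute) are writing its markup with str() or writing just the image URL:
img = item.contents[0].find_all("img", {"class": "media-full"})[0]
filen.write(str(img))                    # the raw <img ...> markup as text
filen.write(img.get("src", "") + '\n')   # or only the image URL, if a src attribute is present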

Related

Cannot scrape email data but can scrape name using BeautifulSoup

import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
data = []
for item in soup.find_all("div", {"class": "asta-member"}):
name = item.find("h4", {"class": "asta-member__name"}).text
data.append([name])
for item in soup.find_all("div", {"class": "asta-member_title"}):
email = item.find("div")[2].text
data.append([email])
with open("contacts.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["name", "email"])
writer.writerows(data)
(Screenshot of the inspected HTML omitted.)
The script scrapes the name data because it sits in its own class, but I can't work out which class or other selector to use to scrape the email.
I have tried everything I can think of.
Suggestion:
Since the div holding the email has no class or id, you can use the .contents attribute in Beautiful Soup, which lists all of a tag's direct children.
In this case, we take the contents of the "asta-member" div, since the email div sits under it, and extract the email text from the right child.
Code:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
soup
data = []
for item in soup.find_all("div", {"class": "asta-member"}):
name = item.find("h4", {"class": "asta-member__name"}).text
data.append(name)
#Code changes here
for item in soup.find_all("div", {"class": "asta-member"}):
email = item.contents[7].text.strip()
data.append(email)
with open("contacts.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["name", "email"])
writer.writerow(data)
#reading the CSV file
df = pd.read_csv('/content/contacts.csv')
df
Output: (screenshot of the resulting DataFrame omitted)
Reference:
https://beautiful-soup-4.readthedocs.io/en/latest/#contents-and-children
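A more defensive variant, in case the position of the email div ever changes: instead of hard-coding .contents[7], scan the member div's children for the first text containing an '@' (this sketch assumes the email is the only such text):
for item in soup.find_all("div", {"class": "asta-member"}):
    # take the first child div whose text contains an '@'; fall back to an empty string
    email = next((div.get_text(strip=True) for div in item.find_all("div") if "@" in div.get_text()), "")
    data.append(email)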

How can I change the code so that the HTML tags do not appear?

from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
With the above code, I get the output:
[<h1 class="celeb-name">Ayden Sng</h1>]
What do I need to change in my code, or how can I make it so that I only get 'Ayden Sng' as my output?
Iterate over each entry of the txt list and extract its text property:
txt = [element.text for element in txt] # ['Ayden Sng']
Full example:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt[0].text)
If there is more than one result, you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
for i in txt:
    print(i.text)
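If the heading ever carries stray whitespace or nested tags, get_text(strip=True) is a slightly more robust way to pull just the name than .text:
for i in txt:
    print(i.get_text(strip=True))   # same idea as .text, with surrounding whitespace stripped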

How do I scrape data from URLs in a python-scraped list of URLs?

I'm trying to use BeautifulSoup4 in Orange to scrape data from a list of URLs scraped from that same website.
I have managed to scrape the data from a single page when I set the URL manually.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zone-points.aspx?year=2021&zone=1&section=1901"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
for child in rank.children:
    print(url, child)
and I have been able to scrape the list of URLs I need
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
link = soup.find('div',class_='contentSection')
url_list = link.find('a').get('href')
for url_list in link.find_all('a'):
    print(url_list.get('href'))
But so far I haven't been able to combine the two to scrape the data from that list of URLs. Can I do that just by nesting for loops, and if so, how?
I'm sorry if this is a basic question, but I only started with Python and web scraping yesterday and haven't been able to figure it out from similar topics.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# get all links
url_list = []
for a in soup.find("div", class_="contentSection").find_all("a"):
    # the parser decodes "&sect" in the href to "§", so restore the original query string
    url_list.append(a["href"].replace("§", "&sect"))

# get all data from URLs
all_data = []
for url in url_list:
    print(url)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "html.parser")

    h2 = soup.h2
    sub = h2.find_next("p")

    for tr in soup.select("tr:has(td)"):
        all_data.append(
            [
                h2.get_text(strip=True),
                sub.get_text(strip=True),
                *[td.get_text(strip=True) for td in tr.select("td")],
            ]
        )

# save data to CSV
df = pd.DataFrame(
    all_data,
    columns=[
        "title",
        "sub_title",
        "Rank",
        "Horse / Owner",
        "Points",
        "Total Comps",
    ],
)
print(df)
df.to_csv("data.csv", index=None)
This traverses all the URLs and saves all the data to data.csv (screenshot from LibreOffice omitted).
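To answer the narrower question: yes, plain nested for loops are enough. A minimal sketch of that idea, reusing the selectors from the snippets above (the href fix mirrors the replace in the answer's code):
import requests
from bs4 import BeautifulSoup

base = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
soup = BeautifulSoup(requests.get(base).text, "html.parser")

# outer loop: every link in the content section
for a in soup.find("div", class_="contentSection").find_all("a"):
    href = a["href"].replace("§", "&sect")  # undo the entity decoding, as in the answer above
    page = BeautifulSoup(requests.get(href).text, "html.parser")
    table = page.find("table", class_="table-standings-body")
    if table is None:  # skip links that do not lead to a standings table
        continue
    # inner loop: every row of that page's standings table
    for child in table.children:
        print(href, child)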

How can I apply a for loop through more than one page and add each page to a CSV file?

How can I get data from more than one page into my CSV file?
from bs4 import BeautifulSoup
import requests
import csv
source = requests.get('https://software-overzicht.nl/amersfoort?page=1','https://software-overzicht.nl/amersfoort?page=2' ).text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv','w')
csv_writter = csv.writer(csv_file)
csv_writter.writerow(['naambedrijf', 'adress'])
for search in soup.find_all('div', class_='company-info-top'):
    title = search.a.text
    adress = search.p.text
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        print(title)
        csv_writter.writerow([title, adress])

csv_file.close()
You just need to move the requests.get() call, and the parsing that follows it, inside your loop over the page range:
from bs4 import BeautifulSoup
import requests
import csv
with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writter = csv.writer(f)
    csv_writter.writerow(['naambedrijf', 'adress'])

    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')

        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            print(title)
            csv_writter.writerow([title, adress])
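A variation on the same idea, in case you prefer not to keep the CSV file open while the requests run: collect the rows first, then write them in one go (range(1, 22) is assumed to still cover every page):
from bs4 import BeautifulSoup
import requests
import csv

rows = []
for page in range(1, 22):
    url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    for search in soup.find_all('div', class_='company-info-top'):
        # one (name, address) pair per company block
        rows.append([search.a.text.strip(), search.p.text.strip()])

with open('cms_scrape.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['naambedrijf', 'adress'])
    writer.writerows(rows)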

isolate 'td a' tag based on class using beautiful soup

I'd like to write the URL links on this page to a file, but there are two 'td a' tags for each row of the table. I just want the one with class="pagelink" href="/search" etc.
I tried the following code, hoping to pick up only the ones with "class": "pagelink", but it produced an error:
AttributeError: 'Doctype' object has no attribute 'find_all'
Can anyone help, please?
import requests
from bs4 import BeautifulSoup as soup
import csv
writer.writerow(['URL', 'Reference', 'Description', 'Address'])

url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=1000&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"

response = session.get(url)  # not used until after the iteration begins
html = soup(response.text, 'lxml')

for link in html:
    prop_link = link.find_all("td a", {"class": "pagelink"})
    writer.writerow([prop_link])
Iterating over your html variable yields its top-level children, and the first of those is a Doctype node, which has no find_all method; that is where the error comes from.
Call find_all or select on the soup object itself to find the nodes you want, and note that find_all takes a tag name, not a CSS selector such as "td a" (use select for that).
Example:
import requests
from bs4 import BeautifulSoup as soup
import csv

outputfilename = 'Ed_Streets2.csv'
#inputfilename = 'Edinburgh.txt'

baseurl = 'https://www.saa.gov.uk'

outputfile = open(outputfilename, 'w', newline='')
writer = csv.writer(outputfile)
writer.writerow(['URL', 'Reference', 'Description', 'Address'])

session = requests.session()

url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=100&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"

response = session.get(url)
html = soup(response.text, 'lxml')

prop_link = html.find_all("a", class_="pagelink button small")

for link in prop_link:
    prop_url = baseurl + link["href"]
    print(prop_url)
    writer.writerow([prop_url, "", "", ""])
Try this.
You need to look for the links before starting the loop.
import requests
from bs4 import BeautifulSoup as soup
import csv

writer.writerow(['URL', 'Reference', 'Description', 'Address'])

url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=1000&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"

response = requests.get(url)
html = soup(response.text, 'lxml')

prop_link = html.find_all("a", {"class": "pagelink button small"})

for link in prop_link:
    if link is not None and link.has_attr("href"):
        wr = link["href"]
        writer.writerow([wr])
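If you want to stay close to the 'td a' wording from the question, select() accepts a CSS selector (find_all does not); a minimal sketch, reusing the html variable from the answers above and assuming the links sit inside table cells:
# "td a.pagelink" matches <a> tags that carry the pagelink class and sit inside a <td>
for link in html.select("td a.pagelink"):
    print(link.get("href"))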
