I want to write prices and their corresponding addresses to a CSV file that I can open in Excel. This is the code I have so far.
What I want is a column for the price first and a column for the address second.
from bs4 import BeautifulSoup
import requests
import csv

number = "1"
url = "http://www.trademe.co.nz/browse/categoryattributesearchresults.aspx?cid=5748&search=1&v=list&134=1&nofilters=1&originalsidebar=1&key=1654466070&page=" + number + "&sort_order=prop_default&rptpath=350-5748-3399-"
r = requests.get(url)
soup = BeautifulSoup(r.content)
output_file = open("output.csv", "w")
price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})
n = 1
while n != 150:
    b = price[n].text
    b = str(b)
    n = n + 1
    output_file.write(b)
output_file.close()
Maybe something like this?
from bs4 import BeautifulSoup
import requests
import csv
....
r = requests.get(url)
soup = BeautifulSoup(r.content)
price = soup.find_all("div",{"class":"property-card-price-container"})
address = soup.find_all("div",{"class":"property-card-subtitle"})
dataset = [(x.text, y.text) for x,y in zip(price, address)]
with open("output.csv", "w", newline='') as csvfile:
writer = csv.writer(csvfile)
for data in dataset[:150]: #truncate to 150 rows
writer.writerow(data)
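If you also want column headings in the file, you can write a header row before the loop; a small optional addition, not something from the original question:

writer.writerow(["price", "address"])  # header row, written once before the data rows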
There are a few problems with your code. Keeping the prices and addresses in two separate lists risks them getting mixed up if the site ever changes the order or number of items. When scraping entries like this, it is important to first find the larger enclosing container and then narrow down from there.
Unfortunately the URL you provided is no longer valid. As such I just browsed to another set of listings for this example:
from bs4 import BeautifulSoup
import requests
import csv
url = 'http://www.trademe.co.nz/property/residential-property-for-sale'
url += '/waikato/view-list'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
with open('output.csv', 'w', newline='') as csvfile:
    propertyWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    for listing in soup.find_all('div',
                                 {'class': 'property-list-view-card'}):
        price = listing.find_all('div',
                                 {'class': 'property-card-price-container'})
        address = listing.find_all('div',
                                   {'class': 'property-card-subtitle'})
        propertyWriter.writerow([price[0].text.strip(),
                                 address[0].text.strip()])
Related
I am new to Python. Can anyone explain what the use of [1:] is in {sum(int(td.text) for td in soup.select('td:last-child')[1:])}, or what [0] and [1] do? I have seen it in many scraping examples with for-in loops. Also, while practicing I built the code below and I am not able to scrape all of the data into the CSV file. Thanks in advance, and sorry for asking two questions at once.
import requests
from bs4 import BeautifulSoup
import csv

url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
lst = []
table = soup.find('div', attrs={'class': 'js-table'})

# for row in table.findAll('div', attrs={'class': 'top-players__player-name'}):
#     score = {}
#     score['Player'] = row.a.text.strip()
#     lst.append(score)

for row in table.findAll(class_='top-players__m top-players__padded '):
    score = {}
    score['Matches'] = int(row.td.text)
    lst.append(score)

filename = 'iplStat.csv'
with open(filename, 'w', newline='') as f:
    w = csv.DictWriter(f, ['Player', 'Matches'])
    w.writeheader()
    for score in lst:
        w.writerow(score)

print(lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd

url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get(url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index=False)
Screenshot of csv file:
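As for the [1:] part of your question: slicing a list with [1:] simply drops its first element (typically a header cell or label), while [0] and [1] index single elements. A quick illustration with a plain list, independent of any particular website:

cells = ['Header', '10', '20', '30']
print(cells[0])    # 'Header' -- the first element
print(cells[1])    # '10' -- the second element
print(cells[1:])   # ['10', '20', '30'] -- everything except the first element
print(sum(int(c) for c in cells[1:]))   # 60 -- sum the numeric cells, skipping the header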
I am trying to crawl the data below (a table of bilingual text).
The problem is that I don't know how many tr elements are on the page, so I just wrote range(0, 24). I'm pretty sure there are at least 24, but the code still says the index is out of range.
How do I crawl this website and get all the information (the bilingual text), even if I don't know how many rows there are?
Below is my code.
from bs4 import BeautifulSoup
import requests

url = "http://www.mongols.eu/mongolian-language/mongolian-tale-six-silver-stars/"
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
gdp_table = soup.find("table", attrs={"class": "table-translations"})
gdp_table_data = gdp_table.tbody.find_all("tr")  # contains the rows

for i in range(0, 24):
    for td in gdp_table_data[i].find_all("td"):
        headings = []
        headings.append(td.get_text(strip=True))
    print(headings[1], " | ", headings[2])
You already iterate over each element of gdp_table_data[i].find_all("td"). Use the same idea for the row iteration:
for tr in gdp_table_data:
    for td in tr.find_all("td"):
        ...
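Putting it together, a minimal sketch that prints the two text columns for every row, assuming (as in your print statement) that each row's cells are number, Mongolian text, and English text in that order:

for tr in gdp_table_data:
    cells = [td.get_text(strip=True) for td in tr.find_all("td")]
    if len(cells) >= 3:   # skip rows that don't have all three cells
        print(cells[1], " | ", cells[2])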
I guess this is the best solution for saving it as a CSV:
import pandas as pd
dfs = pd.read_html('http://www.mongols.eu/mongolian-language/mongolian-tale-six-silver-stars/')
df = pd.concat(dfs)
df.to_csv('a.csv')
This saves a CSV file (a.csv) with the data.
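If you don't want the DataFrame's row index written as an extra first column, pass index=False to to_csv:

df.to_csv('a.csv', index=False)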
Or only printing:
import requests
from bs4 import BeautifulSoup
r = requests.get('http://www.mongols.eu/mongolian-language/mongolian-tale-six-silver-stars/')
soup = BeautifulSoup(r.content, 'html.parser')
trs = soup.select('table.table-translations tr')
for tr in trs:
    print(tr.get_text())
prints:
No.
Mongolian text
Loosely translated into English
1.
Зургаан мөнгөн мичид
Six silver stars
2.
Эрт урьд цагт зургаан өнчин хүүхэд товцог толгой дээр наадан суудаг юм санжээ.
Long ago, there were six orphan brothers playing on the top of a hill.
Тэгсэн чинь ах нь нэг өдөр хэлж:
One day the oldest brother said:
and so on...
This script will write all translations to data.csv:
import csv
import requests
from bs4 import BeautifulSoup
url = 'http://www.mongols.eu/mongolian-language/mongolian-tale-six-silver-stars/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
all_data = []
for row in soup.select('.table-translations tr')[1:]:
    mongolian, english = map(lambda t: t.get_text(strip=True), row.select('td')[1:])
    all_data.append((mongolian, english))

with open('data.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in all_data:
        spamwriter.writerow(row)
This creates data.csv with one (Mongolian, English) pair per row.
How would I proceed with this web-scraping project using bs4 and requests? I am trying to extract user info from a forum site (myfitnesspal, specifically: https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1): the username, message, and date posted, and load them into columns in a CSV. I have this code so far but am unsure how to proceed:
from bs4 import BeautifulSoup
import csv
import requests
# get page source and create a BS object
print('Reading page...')
page = requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
src = page.content
soup = BeautifulSoup(src, 'html.parser')
#container = soup.select('#vanilla_discussion_index > div.container')
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul')
postdata = soup.select('div.Message')
user = []
date = []
text = []
for post in postdata:
    text.append(BeautifulSoup(str(post), 'html.parser').get_text().encode('utf-8').strip())

print(text)  # this stores the text of each comment/post in a list,
             # so next I'd want to store this in a csv with columns
             # user, date posted, post (with this under the post column)
             # and do the same for user and date
This script will get all messages from the page and save them in data.csv:
import csv
import requests
from bs4 import BeautifulSoup
url = 'https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
all_data = []
for u, d, m in zip(soup.select('.Username'), soup.select('.DateCreated'), soup.select('.Message')):
    all_data.append([u.text, d.get_text(strip=True), m.get_text(strip=True, separator='\n')])

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in all_data:
        writer.writerow(row)
Screenshot from LibreOffice:
One rule of thumb I like to follow with web scraping is being specific as possible without picking up unnecessary information. So for example, if I want to select a username I inspect the element containing the information I need:
<a class="Username" href="...">Username</a>
Since I am trying to collect usernames it makes the most sense to select by the class "Username":
soup.select("a.Username")
This gives me a list of all the usernames found on the page, which is great. However, if we want to select the data in "packages" (by post, in your example), we need to collect each post individually.
To accomplish this you could do something like the following:
comments = soup.select("div.comment")
This will make it easier to then do the following:
with open('file.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['user', 'date', 'text'])
    for comment in comments:
        username = comment.select_one("div.Username")
        date = comment.select_one("span.BodyDate")
        message = comment.select_one("div.Message")
        writer.writerow([username, date, message])
Doing it this way also makes sure your data stays in order even if an element is missing.
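One caveat: select_one returns None when an element is missing, so if you want the text rather than the raw tag in the CSV, guard against that. For example, with a small helper (cell_text is just an illustrative name, not part of the original code):

def cell_text(tag):
    # return the stripped text of a tag, or an empty string if the tag wasn't found
    return tag.get_text(strip=True) if tag is not None else ''

writer.writerow([cell_text(username), cell_text(date), cell_text(message)])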
Here you go:
from bs4 import BeautifulSoup
import csv
import requests
page = requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
soup = BeautifulSoup(page.content, 'html.parser')
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul > li')
with open('data.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['user', 'date', 'text'])
    writer.writeheader()
    for comment in container:
        writer.writerow({
            'user': comment.find('a', {'class': 'Username'}).get_text(),
            'date': comment.find('span', {'class': 'BodyDate DateCreated'}).get_text().strip(),
            'text': comment.find('div', {'class': 'Message'}).get_text().strip()
        })
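One small note on the open() call: the csv module documentation recommends opening the file with newline='' so the writer doesn't insert blank lines between rows on Windows, e.g.:

with open('data.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['user', 'date', 'text'])
    ...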
I am new to scraping/BS4 and am having a problem getting this CSV file to list all of the members. My problem is that the CSV repeats one member's information over multiple lines. If anyone has any ideas on how to fix this, it would be greatly appreciated.
import requests
import csv
from bs4 import BeautifulSoup
r = requests.get('https://vermontmaple.org/basic-member-list')
soup = BeautifulSoup(r.text, 'html.parser')
with open('list.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    for company in soup.findAll('div', class_='directory_item selected'):
        maple_name = soup.find('div', class_='name').get_text(strip=True)
        maple_address = soup.find('div', class_='address').get_text(strip=True)
        maple_phone = soup.find('div', class_='phone').get_text(strip=True)
        writer.writerow([maple_name, maple_address, maple_phone])
f.close()
Change soup.find to company.find inside the for loop:
for company in soup.findAll('div', class_='directory_item selected'):
    maple_name = company.find('div', class_='name').get_text(strip=True)
    maple_address = company.find('div', class_='address').get_text(strip=True)
    maple_phone = company.find('div', class_='phone').get_text(strip=True)
There is no need for f.close(): the with statement closes the file automatically when the block ends.
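Putting it together, the corrected loop looks something like this (same class names and CSV layout as your original code, plus newline='' on the open call):

with open('list.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    for company in soup.findAll('div', class_='directory_item selected'):
        # search within the current company block, not the whole page
        maple_name = company.find('div', class_='name').get_text(strip=True)
        maple_address = company.find('div', class_='address').get_text(strip=True)
        maple_phone = company.find('div', class_='phone').get_text(strip=True)
        writer.writerow([maple_name, maple_address, maple_phone])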
Two issues. My goal is to ETL a three-column CSV with the column headings date, time, and tweet.
1. My attempts at extracting the span text (the time) out of each li result in the span's text being duplicated in both the time and tweet columns. It's my first week working with Python; I've tried to replace() the time text in the tweet column with "", but I end up removing both instances of the time.
2. Combining the columns in order, i.e. correctly matching the data columns as they appear. The code I write results in either 30,000 or 1,000 lines, but the correct CSV file should be around 520 lines.
import bs4 as bs
import requests, urllib.request, csv
from urllib.request import urlopen

sauce = urllib.request.urlopen('https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858').read()
soup = bs.BeautifulSoup(sauce, 'html.parser')
lists = soup.find_all('li', class_='visible')
dates = soup.find_all("li", attrs={"data-date": True})
tweet_data = ['date, time, tweets']
for li in dates[1:]:
    date = li['data-date']
    tweet_data.append([date])
for list in lists[1:]:
    time = list.find_all('span', {"class": "gray"})[0].text
    tweets = list.text
    tweet_data.append([time, tweets])
with open('tweets_attempt_8.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(tweet_data)
Here is code that produces the output you need. I hope you are satisfied with this answer.
import bs4 as bs
import urllib2, csv
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

# full URL including the scheme; urllib2 needs it
url = 'https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858'
sauce = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
con = urllib2.urlopen(sauce)
data = con.read()
soup = bs.BeautifulSoup(data, 'html.parser')
lists = soup.find_all('li', class_='visible')
dates = soup.find_all("li", attrs={"data-date": True})

tweet_data = [['date', 'time', 'tweets']]  # header row as a list, not a single string
for li, list in zip(dates[1:], lists[1:]):
    date = li['data-date']
    time = list.find_all('span', {"class": "gray"})[0].text
    tweets = list.text
    tweet_data.append([date, time, tweets])

with open('/tmp/tweets_attempt_8.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(tweet_data)
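Side note: this answer is written for Python 2 (urllib2 and sys.setdefaultencoding don't exist on Python 3). On Python 3 the fetch would look roughly like this, keeping the same User-Agent header:

import requests
data = requests.get(url, headers={'User-Agent': 'Magic Browser'}).content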
The output looks like what you wanted.
Try this. There are 504 entries on that page to parse, and you will get all of them in the CSV output.
import csv
import requests
from bs4 import BeautifulSoup

with open('tweets_attempt_8.csv', 'w', newline='', encoding='utf8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['date', 'time', 'tweets'])
    sauce = requests.get('https://www.washingtonpost.com/graphics/politics/100-days-of-trump-tweets/?utm_term=.0c2052f6d858', headers={"User-Agent": "Existed"}).text
    soup = BeautifulSoup(sauce, "html.parser")
    for item in soup.select("li.pg-excerpt.visible"):
        date = item.get('data-date')
        time = item.select("span.gray")[0].text
        title = item.text.strip()
        print(date, time, title[10:])
        writer.writerow([date, time, title[10:]])