Sourcing Python BeautifulSoup Requests from URL List - python

I am planning to use a URL list to scrape several pages consecutively, using the code below.
Is there a smart way to replace the manually inserted terms in "desired_google_queries" with a reference to an extensive URL list (which could be a CSV or Excel file)?
from bs4 import BeautifulSoup
import urllib.request
import csv

desired_google_queries = ['Word', 'lifdsst', 'yvou', 'should', 'load']

for query in desired_google_queries:
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': 'Magic Browser'})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    print(resultStats)

with open('queries.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])
    for query in desired_google_queries:
        ...
        spamwriter.writerow([query, resultStats])

You can put your scraping logic into a function, and then call it on each of the queries you read from your .csv file.
from bs4 import BeautifulSoup
import urllib.request
import csv

def scrape_site(query):
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': 'Magic Browser'})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    return resultStats

#####################################################
# Read in queries from .csv to desired_google_queries

with open('queries.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])
    for query in desired_google_queries:
        resultStats = scrape_site(query)
        spamwriter.writerow([query, resultStats])
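For the read-in step that the comment above leaves open, a minimal sketch, assuming the queries sit one per row in the first column of a file named query_list.csv (both the filename and the layout are assumptions):

import csv

with open('query_list.csv', newline='') as infile:
    # One query per row, first column only; skip blank rows
    desired_google_queries = [row[0] for row in csv.reader(infile) if row]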

Related

Save Beautiful Soup Output to JSON

I have a loop that I am not sure how to close so I can save the output locally as JSON. My print output looks fine, but I only get the last line of my output in the JSON file I save with the following:
from urllib.request import Request, urlopen

req = Request('https://www.website.com', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
html = webpage.decode('utf8')
type(html)

from bs4 import BeautifulSoup
htmlsoup = BeautifulSoup(html, 'html.parser')
type(htmlsoup)

anchors = htmlsoup.find_all('a')
type(anchors)

for a in anchors:
    print(a.text)

filename = 'website.json'
import json
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(a.text, f, ensure_ascii=False, indent=4)
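The reason only the last line lands in the file is that json.dump runs after the loop, on whichever a was last. A minimal sketch of one way to fix it, collecting every anchor text into a list and dumping that list once (the list-of-strings output format is an assumption):

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import json

req = Request('https://www.website.com', headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(req).read().decode('utf8')
htmlsoup = BeautifulSoup(html, 'html.parser')

# Gather all anchor texts instead of overwriting a single value
anchor_texts = [a.text for a in htmlsoup.find_all('a')]

with open('website.json', 'w', encoding='utf-8') as f:
    json.dump(anchor_texts, f, ensure_ascii=False, indent=4)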

Write list values to CSV as they pertain to current iteration

I'm trying to write a list to a CSV file so that each search term and its URL end up in their own row.
I'm sure I'm not using the CSV library correctly, since it prints each character of just the first link to the file. Here's my code:
for t in terms:
    fields = ["Search Term", "URL"]
    url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, "lxml")
    for item in soup.find_all("item"):
        link = str(item)
        i = link.find("<link/>")
        j = link.find("<guid")
        links = link[i+7:j]
        with open("urls.csv", "w") as f:
            write = csv.writer(f)
            write.writerow(fields)
            write.writerows(links)
Any help would be so appreciated. Thanks!!
Use the xml parser when creating the soup:
import csv
import requests
from bs4 import BeautifulSoup

terms = ["refrigerator", "kitchen sink"]

with open("urls.csv", "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["Search Term", "URL"])
    for t in terms:
        url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
        print(f"Getting {url}")
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, "xml")
        for item in soup.find_all("link"):
            writer.writerow([t, item.get_text(strip=True)])
Creates urls.csv with a Search Term column and a URL column.
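Note that the "xml" parser used here is provided by lxml, so lxml may need to be installed first (for example with pip install lxml).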

Scrape table html multipage with beautifulsoup4 and urllib3

Help me please,
the code I made only works for one page; I want it to work for all pages. What should I do?
import csv
import urllib3
from bs4 import BeautifulSoup

outfile = open("data.csv", "w", newline='')
writer = csv.writer(outfile)

for i in range(1, 20):
    url = f'http://ciumi.com/cspos/barcode-ritel.php?page={i}'

req = urllib3.PoolManager()
res = req.request('GET', url)
tree = BeautifulSoup(res.data, 'html.parser')
table_tag = tree.select("table")[0]
tab_data = [[item.text for item in row_data.select("th,td")]
            for row_data in table_tag.select("tr")]
for data in tab_data:
    writer.writerow(data)
    print(res, url, ' '.join(data))
Your code is working well; if you want to scrape all the URLs and get data from them, you just have to indent it correctly:
import csv
import urllib3
from bs4 import BeautifulSoup

outfile = open("data.csv", "w", newline='')
writer = csv.writer(outfile)

for i in range(1, 20):
    url = f'http://ciumi.com/cspos/barcode-ritel.php?page={i}'
    req = urllib3.PoolManager()
    res = req.request('GET', url)
    tree = BeautifulSoup(res.data, 'html.parser')
    table_tag = tree.select("table")[0]
    tab_data = [[item.text for item in row_data.select("th,td")]
                for row_data in table_tag.select("tr")]
    for data in tab_data:
        writer.writerow(data)
        print(res, url, ' '.join(data))
But you will have to clean the data to get a tidy CSV file.
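A minimal sketch of one way to do that cleanup, replacing the inner writer loop above; the exact cleaning rules (collapsing whitespace, skipping empty rows) are assumptions:

    for data in tab_data:
        # Collapse runs of whitespace and trim each cell
        cleaned = [' '.join(cell.split()) for cell in data]
        # Only write rows that still contain some content
        if any(cleaned):
            writer.writerow(cleaned)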

writing beautiful soup output to CSV

I want to write prices and corresponding addresses to a CSV file that I can open in Excel. I have this code so far, which gives the output shown in the photo below.
What I want is a column for the price first and a column for the address second.
from bs4 import BeautifulSoup
import requests
import csv

number = "1"
url = "http://www.trademe.co.nz/browse/categoryattributesearchresults.aspx?cid=5748&search=1&v=list&134=1&nofilters=1&originalsidebar=1&key=1654466070&page=" + number + "&sort_order=prop_default&rptpath=350-5748-3399-"
r = requests.get(url)
soup = BeautifulSoup(r.content)
output_file = open("output.csv", "w")
price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})
n = 1
while n != 150:
    b = (price[n].text)
    b = str(b)
    n = n + 1
    output_file.write(b)
output_file.close()
Maybe something like this?
from bs4 import BeautifulSoup
import requests
import csv

....

r = requests.get(url)
soup = BeautifulSoup(r.content)
price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})

dataset = [(x.text, y.text) for x, y in zip(price, address)]

with open("output.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    for data in dataset[:150]:  # truncate to 150 rows
        writer.writerow(data)
There are a few problems with your code. Getting the prices and addresses into separate lists risks the site switching the order of the items, etc. and getting them mixed up. When scraping entries like this it is important to first find the larger enclosing container, then narrow down from there.
Unfortunately the URL you provided is no longer valid. As such I just browsed to another set of listings for this example:
from bs4 import BeautifulSoup
import requests
import csv

url = 'http://www.trademe.co.nz/property/residential-property-for-sale'
url += '/waikato/view-list'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

with open('output.csv', 'w', newline='') as csvfile:
    propertyWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    for listing in soup.find_all('div',
                                 {'class': 'property-list-view-card'}):
        price = listing.find_all('div',
                                 {'class': 'property-card-price-container'})
        address = listing.find_all('div',
                                   {'class': 'property-card-subtitle'})
        propertyWriter.writerow([price[0].text.strip(),
                                 address[0].text.strip()])
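Note that 'html5lib' is a separate parser package, so it may need to be installed before this runs (for example with pip install html5lib).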

writing info to csv in python

import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

base_url = 'http://www.baseball-reference.com/'  # base url for concatenation
data = requests.get("http://www.baseball-reference.com/teams/BAL/2014-schedule-scores.shtml")  # website for scraping
soup = BeautifulSoup(data.content)
b = 5
for link in soup.find_all('a'):
    if not link.has_attr('href'):
        continue
    if link.get_text() != 'boxscore':
        continue
    url = base_url + link['href']
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html)
    # Scores
    table = soup.find('table', attrs={'id': 'BaltimoreOriolespitching'})
    for row in table.findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('td'):
            text = cell.text.replace(' ', '')
            list_of_cells.append(text)
        for list in list_of_cells:
            with open('test1.csv', 'w', newline='') as fp:
                a = csv.writer(fp, delimiter=',')
                a.writerows(list)
I am trying to write the scraped info to a CSV so that each piece of information has its own cell. The more I play with the code, the more I either get an indentation error or only the first row prints to the CSV and that's it.
IndentationError: expected an indented block
I think the first thing to consider is moving the file open and the CSV writer creation outside the loop. I think you're overwriting the CSV file ('w') on each pass through the for loop. So try this:
with open('test1.csv', 'w', newline='') as fp:
    csvw = csv.writer(fp, delimiter=',')
    for link in soup.find_all('a'):
        if not link.has_attr('href'):
            continue
        if link.get_text() != 'boxscore':
            continue
        url = base_url + link['href']
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html)
        # Scores
        table = soup.find('table', attrs={'id': 'BaltimoreOriolespitching'})
        for row in table.findAll('tr'):
            list_of_cells = []
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '')
                list_of_cells.append(text)
            for list in list_of_cells:
                csvw.writerows(list)
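One remaining detail, not covered in the answer above: calling writerows on a string writes each character as a separate row. To get one cell per table value, the innermost loop can be replaced with a single call that writes the whole row at once:

            # Write the full list of cells as one CSV row
            csvw.writerow(list_of_cells)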
