I have a loop that I'm not sure how to structure so that I can save its output locally as JSON. The printed output looks fine, but only the last line of it ends up in the saved JSON file with the following code:
from urllib.request import Request, urlopen
req = Request('https://www.website.com', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
html = webpage.decode('utf8')
type(html)
from bs4 import BeautifulSoup
htmlsoup = BeautifulSoup(html, 'html.parser')
type(htmlsoup)
anchors = htmlsoup.find_all('a')
type(anchors)
for a in anchors:
    print(a.text)
filename = 'website.json'
import json
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(a.text, f, ensure_ascii=False, indent=4)
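A minimal sketch of one way around this, assuming the goal is to keep every anchor's text rather than only the last one: collect the values in a list inside the loop, then call json.dump once after the loop has finished.

anchor_texts = []
for a in anchors:
    anchor_texts.append(a.text)  # keep every value instead of overwriting it

with open(filename, 'w', encoding='utf-8') as f:
    json.dump(anchor_texts, f, ensure_ascii=False, indent=4)  # one dump of the whole list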
My code below works, but I want it to do exactly the same thing for the next pages of the URL variable; those pages are reached by appending the number 1, 2, 3, and so on, depending on the page.
The code essentially scrapes a website that shows the thumbnails of various videos and returns the link to each video. I want it to do this for every page available.
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
from urllib.request import Request, urlopen
URL = "domain.com/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
endof = soup.find_all('div',class_="th-image")
links = [a['href'] for a in soup.find_all('a', href=True)]
endoflinks = links[8:-8]
index = 0
for a in endoflinks:
    index += 1
    dwnlink = "domain.com" + endoflinks[index]
    r = requests.get(dwnlink)
    f = open("output.txt", "a")
    print(r.url, file=f)
    f.close()
This should help you get going:
URL = "domain.com/"
for i in range(10):
    print("domain.com/" + str(i))
    r = requests.get(URL + str(i))
    f = open("output.txt", "a")
    print(r.url, file=f)
    f.close()
domain.com/0
domain.com/1
domain.com/2
domain.com/3
domain.com/4
domain.com/5
domain.com/6
domain.com/7
domain.com/8
domain.com/9
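Putting that together with the original extraction, a rough sketch (domain.com is the placeholder from the snippets above, and the page range and the links[8:-8] slice are assumptions carried over from them):

with open("output.txt", "a") as f:
    for i in range(10):                      # assumed page numbers 0..9
        page = requests.get("domain.com/" + str(i))
        soup = BeautifulSoup(page.content, "html.parser")
        links = [a['href'] for a in soup.find_all('a', href=True)]
        for href in links[8:-8]:             # same slice as the original code
            r = requests.get("domain.com" + href)
            print(r.url, file=f)             # one resolved video link per line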
I'm trying to write a list to a CSV file so that it comes out with a "Search Term" column and a "URL" column.
I'm sure I'm not using the csv library correctly, since it prints each character of just the first link to the file. Here's my code:
for t in terms:
    fields = ["Search Term", "URL"]
    url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, "lxml")
    for item in soup.find_all("item"):
        link = str(item)
        i = link.find("<link/>")
        j = link.find("<guid")
        links = link[i+7:j]
        with open("urls.csv", "w") as f:
            write = csv.writer(f)
            write.writerow(fields)
            write.writerows(links)
Any help would be so appreciated. Thanks!!
Use xml parser when creating the soup:
import csv
import requests
from bs4 import BeautifulSoup
terms = ["refrigerator", "kitchen sink"]
with open("urls.csv", "w") as f_out:
writer = csv.writer(f_out)
writer.writerow(["Search Term", "URL"])
for t in terms:
url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
print(f"Getting {url}")
html_page = requests.get(url)
soup = BeautifulSoup(html_page.content, "xml")
for item in soup.find_all("link"):
writer.writerow([t, item.get_text(strip=True)])
This creates urls.csv (screenshot from LibreOffice omitted).
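The reason the xml parser matters here: RSS is XML, and with html.parser a <link> tag is treated as a void HTML element, so the URL text ends up outside the tag (which is why the original code fell back to string slicing). A quick illustration on a made-up fragment:

from bs4 import BeautifulSoup

fragment = "<item><link>https://example.com/a</link></item>"
print(BeautifulSoup(fragment, "html.parser").find("link"))      # <link/> - the URL is not inside the tag
print(BeautifulSoup(fragment, "xml").find("link").get_text())   # https://example.com/a (the xml parser needs lxml installed)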
Help me please. The code I made only works for one page; I want it to work for all pages. What should I do?
import csv
import urllib3
from bs4 import BeautifulSoup
outfile = open("data.csv","w",newline='')
writer = csv.writer(outfile)
for i in range(1,20):
    url = f'http://ciumi.com/cspos/barcode-ritel.php?page={i}'
req = urllib3.PoolManager()
res = req.request('GET', url)
tree = BeautifulSoup(res.data, 'html.parser')
table_tag = tree.select("table")[0]
tab_data = [[item.text for item in row_data.select("th,td")]
            for row_data in table_tag.select("tr")]
for data in tab_data:
    writer.writerow(data)
    print(res, url, ' '.join(data))
Your code is working well; if you want to scrape all the URIs and get data from them, you just have to indent it correctly:
import csv
import urllib3
from bs4 import BeautifulSoup
outfile = open("data.csv","w",newline='')
writer = csv.writer(outfile)
for i in range(1, 20):
    url = f'http://ciumi.com/cspos/barcode-ritel.php?page={i}'
    req = urllib3.PoolManager()
    res = req.request('GET', url)
    tree = BeautifulSoup(res.data, 'html.parser')
    table_tag = tree.select("table")[0]
    tab_data = [[item.text for item in row_data.select("th,td")] for row_data in table_tag.select("tr")]
    for data in tab_data:
        writer.writerow(data)
        print(res, url, ' '.join(data))
But you will have to clean the data to get a tidy CSV file.
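For example, a minimal sketch of one way to tidy each row before writing it (stripping whitespace and dropping empty rows is just one guess at what the cleaning should do):

for data in tab_data:
    cleaned = [cell.strip() for cell in data]   # remove stray spaces and newlines from each cell
    if any(cleaned):                            # skip rows that are completely empty
        writer.writerow(cleaned)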
I am planning to use a url list to scrape several pages consecutively, using the code below.
Is there a smart way to replace the manually inserted terms in "desired_google_queries" with a reference to an extensive URL list (which could be a CSV or Excel file)?
from bs4 import BeautifulSoup
import urllib.request
import csv
desired_google_queries = ['Word', 'lifdsst', 'yvou', 'should', 'load']

for query in desired_google_queries:
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    print(resultStats)

with open('queries.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])
    for query in desired_google_queries:
        ...
        spamwriter.writerow([query, resultStats])
You can put your scraping logic into a function and then call it for each of the queries you read from your .csv file.
from bs4 import BeautifulSoup
import urllib.request
import csv
def scrape_site(query):
    url = 'http://google.com/search?q=' + query
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    response = urllib.request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    resultStats = soup.find(id="resultStats").string
    return resultStats

#####################################################
# Read in queries from .csv to desired_google_queries
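# Hedged sketch of that step - the file name 'desired_google_queries.csv' and the
# one-query-per-row layout are assumptions, not part of the original answer:
with open('desired_google_queries.csv', newline='') as query_file:
    desired_google_queries = [row[0] for row in csv.reader(query_file) if row]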
with open('queries.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['query', 'resultStats'])
    for query in desired_google_queries:
        resultStats = scrape_site(query)
        spamwriter.writerow([query, resultStats])
from bs4 import BeautifulSoup
import requests
import os
url = "http://nos.nl/artikel/2093082-steeds-meer-nekklachten-bij-kinderen-door-gebruik-tablets.html"
r = requests.get(url)
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'))
data = soup.find_all("article", {"class": "article"})
with open("data1.txt", "wb") as file:
content=‘utf-8’
for item in data:
content+='''{}\n{}\n\n{}\n{}'''.format( item.contents[0].find_all("time", {"datetime": "2016-03-16T09:50:30+0100"})[0].text,
item.contents[0].find_all("a", {"class": "link-grey"})[0].text,
item.contents[0].find_all("img", {"class": "media-full"})[0],
item.contents[1].find_all("div", {"class": "article_textwrap"})[0].text,
)
with open("data1.txt".format(file_name), "wb") as file:
file.write(content)
I recently solved a UTF/Unicode problem, but now it isn't saving the data as a .txt file, or saving it at all. What do I need to do?
If you want to write the data to the file as UTF-8, try codecs.open, like:
from bs4 import BeautifulSoup
import requests
import os
import codecs
url = "http://nos.nl/artikel/2093082-steeds-meer-nekklachten-bij-kinderen-door-gebruik-tablets.html"
r = requests.get(url)
soup = BeautifulSoup(r.content)
data = soup.find_all("article", {"class": "article"})
with codecs.open("data1.txt", "wb", "utf-8") as filen:
for item in data:
filen.write(item.contents[0].find_all("time", {"datetime": "2016-03-16T09:50:30+0100"})[0].get_text())
filen.write('\n')
filen.write(item.contents[0].find_all("a", {"class": "link-grey"})[0].get_text())
filen.write('\n\n')
filen.write(item.contents[0].find_all("img", {"class": "media-full"})[0].get_text())
filen.write('\n')
filen.write(item.contents[1].find_all("div", {"class": "article_textwrap"})[0].get_text())
I'm unsure about filen.write(item.contents[0].find_all("img", {"class": "media-full"})[0]) because that returned a Tag instance for me.
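As a side note, a short sketch of an alternative, assuming Python 3: the built-in open accepts an encoding directly, and wrapping the img tag in str(...) writes its markup instead of the empty string that get_text() returns for an <img>:

with open("data1.txt", "w", encoding="utf-8") as filen:
    for item in data:
        img_tag = item.contents[0].find_all("img", {"class": "media-full"})[0]
        filen.write(str(img_tag) + '\n')   # str() keeps the <img ...> markup itself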