I am writing the crawled output of a webpage to CSV files. However, a few special characters, such as the hyphen, are not being written correctly.
Original Text : Amazon Forecast - Now Generally Available
Result in csv : Amazon Forecast – Now Generally Available
I tried the code below:
from bs4 import BeautifulSoup
from datetime import date
import requests
import csv

source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')

# csv_file = open('aitrendsresults.csv', 'w')
csv_file = open('aws_cloud_results.csv', 'w', encoding='utf8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title', 'img', 'src', 'summary'])

match = soup.find_all('div', class_='lb-row lb-snap')
for n in match:
    imgsrc = n.div.img.get('src')
    titlesrc = n.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
    titletxt = titlesrc.h2.text
    anchortxt = titlesrc.a.get('href')
    sumtxt = titlesrc.section.p.text
    print(sumtxt)
    csv_writer.writerow([titletxt, imgsrc, anchortxt, sumtxt])
csv_file.close()
Can you please help me get the text to match the original text provided above?
Create a function that keeps only ASCII characters (e.g. hyphen, semicolon) and pass your string as the argument to the function below:

def decode_ascii(string):
    return string.encode('ascii', 'ignore').decode('ascii')
input_text = 'Amazon Forecast - Now Generally Available'
output_text = decode_ascii(input_text)
print(output_text)
Output should be Amazon Forecast - Now Generally Available in the CSV.
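To use this in the scraper, a minimal sketch would be to pass each text field through decode_ascii() before writing the row; note that the 'ignore' error handler silently drops non-ASCII characters (such as an en dash) rather than converting them to ASCII equivalents:

csv_writer.writerow([decode_ascii(titletxt), imgsrc, anchortxt, decode_ascii(sumtxt)])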
I've been working with BS as well and I think you've only made a minor mistake. In line 8, where you open the csv file, the encoding should be "UTF-8" instead of "utf8".
See if that helps.
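That is, line 8 of the snippet would become:

csv_file = open('aws_cloud_results.csv', 'w', encoding='UTF-8')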
Using title as a test, the following works for me:
from bs4 import BeautifulSoup
import requests, csv

source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')

with open("aws_cloud_results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['title'])
    match = soup.find_all('div', class_='lb-row lb-snap')
    for n in match:
        titlesrc = n.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
        titletxt = titlesrc.h2.text
        w.writerow([titletxt])
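The "utf-8-sig" encoding writes a byte order mark at the start of the file, which lets Excel detect UTF-8 instead of falling back to a legacy code page; that misdetection is what turns an en dash into something like –. To check that the file round-trips cleanly, a quick read-back (assuming the file written above):

import csv
with open("aws_cloud_results.csv", encoding="utf-8-sig", newline='') as f:
    for row in csv.reader(f, delimiter=";"):
        print(row)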
I have been struggling with something, and it didn't work no matter what I tried: I need to apply multiple replace() calls, but Python seemed to let me use only one at a time.
Here is my CSV output: https://i.stack.imgur.com/HtBSn.png
First, some values show up as N/A; they should be 0 or at least some other string.
Second, there are stray spaces in some country names, like North Macedonia, and they shouldn't be there.
import csv
import requests
from bs4 import BeautifulSoup
from csv import QUOTE_NONE
from csv import writer

response = requests.get('https://www.worldometers.info/coronavirus/#news').content
soup = BeautifulSoup(response, 'lxml')
tbody = soup.find('table', id='main_table_countries_today').find('tbody').find_all('tr')[100:110]

with open('corona1.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file, escapechar=' ', quoting=csv.QUOTE_NONE)
    csv_writer.writerow(['countries', 'total_cases', 'total_deaths', 'total_recovered', 'active_cases', 'total_cases_in_1m', 'deaths_in_1m', 'population'])
    for value in tbody:
        countries = value.find_all('td')[1].text.replace(",", "").strip()
        total_cases = value.find_all('td')[2].text.replace(",", "").strip()
        total_deaths = value.find_all('td')[4].text.replace(",", "").strip()
        total_recovered = value.find_all('td')[6].text.replace(",", "").strip()
        active_cases = value.find_all('td')[8].text.replace(",", "").strip()
        total_cases_in_1m = value.find_all('td')[10].text.replace(",", "").strip()
        deaths_in_1m = value.find_all('td')[11].text.replace(",", "").strip()
        population = value.find_all('td')[14].text.replace(",", "").strip()
        csv_writer.writerow([countries, total_cases, total_deaths, total_recovered, active_cases, total_cases_in_1m, deaths_in_1m, population])
This is my current Python code. What should I change?
I would like to have something like:
total_recovered=value.find_all('td')[6].text.replace(",", "").replace("N/A","0").replace(" ","").strip()
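For what it's worth, str.replace() returns a new string, so the calls chain exactly like that without any special handling; a quick check in isolation:

s = " N/A,text "
print(s.replace(",", "").replace("N/A", "0").replace(" ", ""))  # prints 0text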
Edit:
This code works for me. I pulled the repetitive work out into a function and call it in csv_writer.writerow:
import csv
import requests
from bs4 import BeautifulSoup
from csv import QUOTE_NONE
from csv import writer

response = requests.get('https://www.worldometers.info/coronavirus/#news').content
soup = BeautifulSoup(response, 'lxml')
tbody = soup.find('table', id='main_table_countries_today').find('tbody').find_all('tr')[100:110]

# every key is replaced by its value in each scraped cell
replacement = {
    ",": "",
    "N/A": "0",
    "\n": "",
    " ": ""
}

def cleanup(webcontent, indices):
    out = []
    for index in indices:
        content = webcontent.find_all('td')[index].text
        for k, v in replacement.items():
            content = content.replace(k, v)
        out.append(content)
    return out

# newline='' avoids blank rows on Windows
with open('corona1.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file, escapechar=' ', quoting=csv.QUOTE_NONE)
    csv_writer.writerow(['countries', 'total_cases', 'total_deaths', 'total_recovered', 'active_cases', 'total_cases_in_1m', 'deaths_in_1m', 'population'])
    for value in tbody:
        csv_writer.writerow(cleanup(value, [1, 2, 4, 6, 8, 10, 11, 14]))
Note: if you open the file in Excel it is not formatted correctly, but for most other programs and APIs it is. You have to change the separator in Excel; see "Import or export text (.txt or .csv) files in Excel".
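As a side note, the escapechar=' ' / QUOTE_NONE combination is only needed because of commas inside the raw values; since cleanup() already strips every comma, the default writer settings would be a possible alternative that Excel opens directly (a sketch, not part of the original answer):

csv_writer = writer(csv_file)  # default quoting is csv.QUOTE_MINIMAL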
First of all, I have to say that I have very little experience with any sort of coding, so I don't completely know what I'm after here, but I'm trying my best!
I've been writing this code that takes the HTML of a certain website and then gives me a .CSV file of the named elements (you can see these in the inspect panel of the website).
So my question is: how can I add criteria to my current code so that it only returns words containing, for example, the letter g?
I'm happy to elaborate!
Thank you already!
import urllib.request
from bs4 import BeautifulSoup
import csv

url = 'https://kouluruoka.fi/menu/kouvola_koulujenruokalista'
request = urllib.request.Request(url)
content = urllib.request.urlopen(request)
parse = BeautifulSoup(content, 'html.parser')

# These get the words in the <h2> and <span> elements
text1 = parse.find_all('h2')
text2 = parse.find_all('span')

# This code uses the texts above to create the .CSV file
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for col1, col2 in zip(text1, text2):
        writer.writerow([col1.get_text().strip(), col2.get_text().strip()])
You can check whether an element's text contains some string/letter this way:
h2_elements = parse.find_all('h2')
span_elements = parse.find_all('span')

# This code uses the texts above to create the .CSV file
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for h2_element, span_element in zip(h2_elements, span_elements):
        h2_element_str = h2_element.get_text().strip()
        span_element_str = span_element.get_text().strip()
        if 'a' in h2_element_str and 'a' in span_element_str:
            writer.writerow([h2_element_str, span_element_str])
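Note that the in check is case-sensitive, so 'g' will not match 'G'. If you want to match both (an assumption about what you need), lowercase the text first:

if 'g' in h2_element_str.lower() and 'g' in span_element_str.lower():
    writer.writerow([h2_element_str, span_element_str])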
Lovely people! I'm totally new to Python. I tried to scrape several URLs and ran into a problem with print.
I tried to print and write the shipment status.
I have two URLs, so ideally I get two results.
This is my code:
from bs4 import BeautifulSoup
import re
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    for p in shipment:
        # extract information
        print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())

import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
I have two problems here:
Problem one: I have only two URLs, and when I print the results, every span is repeated 4 times (as there are four spans).
The result in the output is as below:
(I deleted the result example to protect privacy.)
Problem two: I tried to write the print output to a text file, but only one line appeared in the file:
(I deleted the result example to protect privacy.)
I want to know what is wrong with the code. I want to print results for the 2 URLs only.
Your help is really appreciated!
Thank you in advance!
The first point is caused by iterating over shipment - just delete the for loop and correct the indentation of print():
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
The second issue is caused by calling the write outside the loop and not in append mode - you will end up with this as your loop:
# open file in append mode
with open('somefile.txt', 'a') as f:
    # start iterating your urls
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = soup.find_all('span')
        Preparation = shipment[0]
        Sent = shipment[1]
        InTransit = shipment[2]
        Delivered = shipment[3]
        # create output text
        line = f'{url};Preparation{Preparation.getText()};Sent{Sent.getText()};InTransit{InTransit.getText()};Delivered{Delivered.getText()}'
        # print output text
        print(line)
        # append output text to file
        f.write(line + '\n')
And you can delete:
import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())`
Example of a slightly optimized version:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

file_path = "randomfile.txt"
with open('somefile.txt', 'a', encoding='utf-8') as f:
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = list(soup.select_one('#progress').stripped_strings)
        line = f"{url},{';'.join([':'.join(x) for x in list(zip(shipment[::2], shipment[1::2]))])}"
        print(line)
        f.write(line + '\n')
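The zip(shipment[::2], shipment[1::2]) expression pairs each even-indexed string (a label) with the odd-indexed string that follows it (its value); a quick illustration with made-up data:

shipment = ['Preparation', 'Done', 'Sent', 'Done', 'InTransit', 'Pending', 'Delivered', 'No']
print(list(zip(shipment[::2], shipment[1::2])))
# [('Preparation', 'Done'), ('Sent', 'Done'), ('InTransit', 'Pending'), ('Delivered', 'No')]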
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
There are four spans actually; try this:
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipments = soup.find_all("span")  # there are four spans actually
    sys.stdout.write('Url ' + url + '; Preparation ' + shipments[0].getText() + '; Sent ' + shipments[1].getText() + '; InTransit ' + shipments[2].getText() + '; Delivered ' + shipments[3].getText())
    # change line
    sys.stdout.write("\n")
First question
You have two nested loops:

for url in line_in_list:
    for p in shipment:
        print(...)

The print is nested in the second loop. If you have 4 shipments per URL, that will lead to 4 prints per URL.
Since you don't use p from for p in shipment, you can completely get rid of the second loop and move the print one indentation level left, like this:
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())
Second question
sys.stdout = open(file_path, "w")
print(url, ';', "Preparation", Preparation.getText(), ";", "Sent", Sent.getText(), ";", "InTransit", InTransit.getText(), ";", "Delivered", Delivered.getText())

Without a keyword argument, print writes to sys.stdout, which is by default your terminal output. There's only one print after sys.stdout = ..., so there will only be one line written to the file.
There's another way to print to a file:

with open('demo.txt', 'a') as f:
    print('Hello world', file=f)

The with keyword will ensure the file is closed even if an exception is raised.
Both combined
From what I understood, you want to print two lines to the file. Here's a solution:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

file_path = "randomfile.txt"

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html")
    # parse something special in the file
    shipment = soup.find_all("span")
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    with open(file_path, "a") as f:
        # note the trailing \n so each URL ends up on its own line
        f.write(
            f"{url} ; Preparation {Preparation.getText()}; Sent {Sent.getText()}; InTransit {InTransit.getText()}; Delivered {Delivered.getText()}\n"
        )
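One caveat: because the file is opened in "a" mode, re-running the script appends to the results of the previous run. If you want a fresh file each time (an addition, not part of the original answer), truncate it once before the loop:

open(file_path, "w").close()  # empty the file before the loop appends to it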
I wrote some code to scrape articles from a particular website so that I can feed the CSV it creates into Geneea (a text-analysis program). The problem is that I wrote this code using unidecode, but I then realized I need to scrape the text with accented characters. The code as it is gives me the output I need, but having text with accented characters is vital for the text-analysis program. Do you have any suggestions on how I can change this code?
Thank you all very much!
My code is below:
import requests
from bs4 import BeautifulSoup
import json
import csv
from unidecode import unidecode

def datetonumeric(stringdate):
    spliteddate = stringdate.split()
    data = (spliteddate[0]).split(".")
    day = int(data[0])
    themonth = int(data[1])
    year = int(data[2])
    if(themonth > 10 or themonth < 7):
        return ""
    if(themonth == 10 and day > 7):
        return ""
    return f'{day}/{themonth}/{year}'

count = 0
with open('parlamentnilistyoutput.csv', 'w', newline='', encoding="UTF-8") as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    spamwriter.writerow(['id_clanku', 'zdroj', 'datum', 'title', 'perex', 'text', 'url'])
    for i in range(20, 70):
        r = requests.get(f'https://www.parlamentnilisty.cz/special/Volby%202021?p={i}')
        soup = BeautifulSoup(r.text, 'html.parser')
        articles = soup.select(".articles-list ul.list-unstyled li")
        for article in articles:
            try:
                # id = (json.loads(article['data-track-list']))['item']['id']
                id_clanku = f'PA000{count+1}'
                urlselector = (article.select("a"))
                url = f"https://www.parlamentnilisty.cz{(urlselector[0])['href']}"
                r = requests.get(url)
                soup = BeautifulSoup(r.text, 'html.parser')
                dateselector = soup.select('div.time')
                date = (dateselector[0]).get_text()
                date = datetonumeric(date)
                print(date)
                if(date != ""):
                    titleselector = soup.select('.article-header h1')
                    title = titleselector[0].get_text()
                    title = title.replace(",", " ")
                    pretextselector = soup.select("p.brief")
                    pretext = pretextselector[0].get_text()
                    pretext = pretext.replace(",", " ")
                    alltext = soup.select('.article-content > p')
                    maintext = ""
                    for text in alltext:
                        maintext = maintext + (text.get_text()).replace("\n", " ")
                    maintext = maintext + "\n"
                    maintext = maintext.replace(",", " ")
                    spamwriter.writerow([id_clanku, 'parlamentnilisty', date, unidecode(title), unidecode(pretext), unidecode(maintext), url])
                    count = count + 1
            except:
                print("wrong request")
                break
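Since the CSV is already opened with encoding="UTF-8", the accents are lost only in the unidecode() calls. A minimal sketch of the change, assuming Geneea accepts UTF-8 input: drop the from unidecode import unidecode line and write the raw strings instead:

spamwriter.writerow([id_clanku, 'parlamentnilisty', date, title, pretext, maintext, url])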
I want to write the list that comes out of the print function into a CSV file.
import re
import requests
from bs4 import BeautifulSoup

for i in range(146):
    r = requests.get(("http://www.yellowpages.com/atlanta-ga/trends/{}").format(i))
    soup = BeautifulSoup(r.content, "html.parser")
    for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
        if 'trends' not in link.get('href'):
            link = (link.get('href'))
            results = (("http://www.yellowpages.com{}?page=").format(link))
            import csv
            with open('Catagories', 'w') as myfile:
                wr = csv.writer(myfile)
                wr.writerow([results])
                print(results)
The purpose of this should be very apparent
@tdelaney is right: every time you open the file with "w", you're overwriting the previous text.
The fix is to use "a" instead:
with open('Catagories', 'a') as myfile:
    ...
Check out the docs: https://docs.python.org/3/library/functions.html#open
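A sketch of the more common pattern, which opens the file once in "w" mode before the loops instead of reopening it per link:

import re
import csv
import requests
from bs4 import BeautifulSoup

with open('Catagories', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    for i in range(146):
        r = requests.get("http://www.yellowpages.com/atlanta-ga/trends/{}".format(i))
        soup = BeautifulSoup(r.content, "html.parser")
        for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
            if 'trends' not in link.get('href'):
                results = "http://www.yellowpages.com{}?page=".format(link.get('href'))
                wr.writerow([results])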