Using multiple replace function in Beautifulsoup - python

I have been dealing with this for a while, but nothing I tried worked. I need to use the replace function multiple times; however, Python only allowed me to use it once.
This is my CSV output. (https://i.stack.imgur.com/HtBSn.png)
Firstly, there are values that appear as N/A. They should be 0 or something similar — in short, a proper string.
Secondly, there are spaces in some country names, like North Macedonia; they shouldn't be there.
`
import csv
import requests
from bs4 import BeautifulSoup
from csv import QUOTE_NONE
from csv import writer


def _clean(text):
    """Normalize one table cell: drop thousands commas, map 'N/A' to '0', remove spaces.

    replace() returns a new string, so calls can simply be chained.
    """
    return text.replace(",", "").replace("N/A", "0").replace(" ", "").strip()


# Fetch the Worldometer coronavirus table and keep rows 100-110 of its body.
response = requests.get('https://www.worldometers.info/coronavirus/#news').content
soup = BeautifulSoup(response, 'lxml')
tbody = soup.find('table', id='main_table_countries_today').find('tbody').find_all('tr')[100:110]

with open('corona1.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file, escapechar=' ', quoting=csv.QUOTE_NONE)
    csv_writer.writerow(['countries', 'total_cases', 'total_deaths', 'total_recovered',
                         'active_cases', 'total_cases_in_1m', 'deaths_in_1m', 'population'])
    for value in tbody:
        # One DOM walk per row instead of one find_all('td') call per column.
        cells = value.find_all('td')
        csv_writer.writerow([_clean(cells[i].text) for i in (1, 2, 4, 6, 8, 10, 11, 14)])
This is my current Python code. What should I change?
I would like to have something like
total_recovered=value.find_all('td')[6].text.replace(",", "").replace("N/A","0").replace(" ","").strip()

Edit:
This code works for me. I extracted the repetitive work into a method and call it in csv.writerow.
import csv
import requests
from bs4 import BeautifulSoup
from csv import QUOTE_NONE
from csv import writer

response = requests.get('https://www.worldometers.info/coronavirus/#news').content
soup = BeautifulSoup(response, 'lxml')
tbody = soup.find('table', id='main_table_countries_today').find('tbody').find_all('tr')[100:110]

# Substrings scrubbed from every cell before it is written to the CSV.
replacement = {
    ",": "",
    "N/A": "0",
    "\n": "",
    " ": ""
}


def cleanup(webcontent, indecies):
    """Return the cleaned text of the <td> cells at the given indices of one row.

    webcontent -- a <tr> element; indecies -- iterable of column indices.
    """
    cells = webcontent.find_all('td')  # walk the row once, not once per index
    out = []
    for index in indecies:
        content = cells[index].text
        # Idiomatic dict iteration: items() instead of unpacking [*replacement].
        for old, new in replacement.items():
            content = content.replace(old, new)
        out.append(content)
    return out


# newline='' prevents the csv module from emitting blank rows on Windows.
with open('corona1.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file, escapechar=' ', quoting=csv.QUOTE_NONE)
    csv_writer.writerow(['countries', 'total_cases', 'total_deaths', 'total_recovered',
                         'active_cases', 'total_cases_in_1m', 'deaths_in_1m', 'population'])
    for value in tbody:
        csv_writer.writerow(cleanup(value, [1, 2, 4, 6, 8, 10, 11, 14]))
Note: If you try to open the file in Excel it is not correctly formatted, but for most other programs and APIs it is. You have to change the separator in Excel. Have a look here: Import or export text (.txt or .csv) in Excel.

Related

Using pythons parse with criteria

First of all, I have to say that I have very little experience with any sort of coding, so I don't completely know what I'm after here, but I'm trying my best!
I've been writing this code that takes the HTML of a certain website and then gives me a .CSV file of the elements that are named (you can see these in the inspect panel of the website).
So my question is, how can I use criteria with my current code so I can tell the code to only return words with, for example, the letter g in them?
I'm happy to elaborate!
Thank you already!
import urllib.request
from bs4 import BeautifulSoup
import csv

# Download and parse the menu page.
url = 'https://kouluruoka.fi/menu/kouvola_koulujenruokalista'
request = urllib.request.Request(url)
content = urllib.request.urlopen(request)
parse = BeautifulSoup(content, 'html.parser')

# Grab every <h2> and <span> element on the page.
text1 = parse.find_all('h2')
text2 = parse.find_all('span')

# Append one (h2, span) pair per row to the CSV file.
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for heading_tag, span_tag in zip(text1, text2):
        row = [heading_tag.get_text().strip(), span_tag.get_text().strip()]
        writer.writerow(row)
You can check if elements contains some string/letter this way:
h2_elements = parse.find_all('h2')
span_elements = parse.find_all('span')

# This code uses the texts above to create the .CSV file,
# keeping only pairs where both strings contain the letter 'a'.
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for h2_tag, span_tag in zip(h2_elements, span_elements):
        texts = [h2_tag.get_text().strip(), span_tag.get_text().strip()]
        if all('a' in text for text in texts):
            writer.writerow(texts)

Issues while writing special characters to csv file

I am writing the crawled output of a webpage to CSV files. However, a few special characters, such as the hyphen, are not getting parsed correctly.
Original Text : Amazon Forecast - Now Generally Available
Result in csv : Amazon Forecast – Now Generally Available
I tried the below code
from bs4 import BeautifulSoup
from datetime import date
import requests
import csv

source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')

# Use a context manager so the file is closed even if a row raises;
# newline='' stops the csv module writing blank rows on Windows.
with open('aws_cloud_results.csv', 'w', encoding='utf8', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['title', 'img', 'src', 'summary'])
    match = soup.find_all('div', class_='lb-row lb-snap')
    for n in match:
        imgsrc = n.div.img.get('src')
        titlesrc = n.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
        titletxt = titlesrc.h2.text
        anchortxt = titlesrc.a.get('href')
        sumtxt = titlesrc.section.p.text
        print(sumtxt)
        csv_writer.writerow([titletxt, imgsrc, anchortxt, sumtxt])
Can you please help me to get the text like the same in original text provided above.
Create a function to handle ASCII characters (i.e. Hyphen, Semicolon) and pass the string as argument inside the function below:
def decode_ascii(string):
    """Return *string* with every non-ASCII character removed."""
    # Keep only code points below 128; equivalent to
    # string.encode('ascii', 'ignore').decode('ascii').
    return ''.join(ch for ch in string if ord(ch) < 128)


input_text = 'Amazon Forecast - Now Generally Available'
output_text = decode_ascii(input_text)
print(output_text)
Output should be Amazon Forecast - Now Generally Available in the CSV.
I've been working with BS as well and I think you've only made a minor mistake. In line 8, where you open the csv file, the encoding should be "UTF-8" instead of "utf8".
See if that helps.
Using title as test the following works for me
from bs4 import BeautifulSoup
import requests, csv

source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')

# utf-8-sig writes a BOM so Excel auto-detects the encoding;
# ';' is used as the column separator.
with open("aws_cloud_results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter = ";", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['title'])
    for entry in soup.find_all('div', class_='lb-row lb-snap'):
        title_div = entry.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
        w.writerow([title_div.h2.text])

CSV indentation and Proper language in Python

I want to return the list that comes out with the print function into a CSV list.
import csv
import re
import requests
from bs4 import BeautifulSoup

# Open the output ONCE before the loops: reopening with mode 'w' inside
# the loop truncated the file on every link, so only the last result
# survived. One writer is reused for every row.
with open('Catagories', 'w') as myfile:
    wr = csv.writer(myfile)
    for i in range(146):
        r = requests.get("http://www.yellowpages.com/atlanta-ga/trends/{}".format(i))
        soup = BeautifulSoup(r.content, "html.parser")
        for link in soup.find_all("a", href=re.compile('/atlanta-ga/')):
            # Skip navigation links back to the trends pages themselves.
            if 'trends' not in link.get('href'):
                href = link.get('href')
                results = "http://www.yellowpages.com{}?page=".format(href)
                wr.writerow([results])
                print(results)
The purpose of this should be very apparent
#tdelaney is right: every time you open the file with "w", you're overwritting the previous text.
The fix is to use "a" instead:
# Append mode: 'a' adds to the end of the file instead of truncating it,
# so rows written by earlier loop iterations are preserved.
with open('Catagories', 'a') as myfile:
...
Check out the docs: https://docs.python.org/3/library/functions.html#open

python give column name and write value in separate column as table

my code
# Python 2 script: scrape the Wintergreen Fund "Top Ten" table to a CSV.
from lxml import html
import requests
import csv
# encoding=utf8
import sys
# Python 2 only: reload() re-exposes sys.setdefaultencoding (which
# site.py normally deletes) so non-ASCII text can be coerced to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# example site
page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)
#This will create a list of services:
# NOTE(review): '#id' is not valid XPath — the attribute axis is '@id';
# as written this raises XPathEvalError. Likely a paste artifact for
# '//*[@id="colLeft"]//table//tr/td[1]/text()'.
tname = tree.xpath('//*[#id="colLeft"]//table//tr/td[1]/text()')
tvalue = tree.xpath('//table//tr/td[2]/text()')
print tname
print tvalue
print 'Input the csv file'
csvfile = raw_input("> ")
# res is the 2-tuple (tname, tvalue), so writerows emits just two rows:
# one with all the names, one with all the values — not name/value pairs.
res = tname,tvalue
#Assuming res is a list of lists
with open(csvfile, "w") as output:
# NOTE(review): the two lines below are not indented under the 'with'
# block — a paste artifact; they must be indented to run.
writer = csv.writer(output, lineterminator='\n')
writer.writerows(res)
my output in csv
Reynolds American Inc. Consolidated-Tomoka Land Co. British American Tobacco
8.30% 7.50% 7.10% 6.60% 6.40% 5.90% 5.30% 4.80% 4.70% 4.10%
Required output same as in website with coulmn name
Ref http://www.wintergreenfund.com/reports/top-ten/
Also, Unicode is not working; I need help with this.
my new code
# Python 2 script: second attempt, collecting one [name, value] pair per row.
from lxml import html
import requests
import csv
page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)
csvrows = []
# NOTE(review): '#id' is not valid XPath — should be '@id'; as written
# this raises XPathEvalError (paste artifact).
for rows in tree.xpath('//*[#id="colLeft"]//table//tr'):
# xpath() returns LISTS here, which is why the CSV shows values wrapped
# in [' '] and empty [] entries; index [0] of each list is needed.
csvrows.append([rows.xpath('./td[1]/text()'),rows.xpath('./td[2]/text()')])
print csvrows
print 'Input the csv file'
csvfile = raw_input("> ")
with open(csvfile, "w") as output:
# NOTE(review): the three lines below must be indented under 'with'.
writer = csv.writer(output, lineterminator='\n')
writer.writerow(['Name','Value']) #substitute as appropriate.
writer.writerows(csvrows)
I am getting values wrapped in [' '] and also empty [ ] entries.
First thing , if you want to combine two lists at each corresponding index , you should use zip() , currently you are creating a tuple of two lists in line - res = tname,tvalue - and then writing it as is to the csv.
Also, secondly, you should first use xpath to get each row in the table, and then use xpath to get each required td element from it. Rather than using two xpaths as you are using currently.
Example -
from lxml import html
import requests
import csv

page = requests.get('http://www.wintergreenfund.com/reports/top-ten/')
tree = html.fromstring(page.text)

csvrows = []
# '@id' (not '#id') is the XPath attribute axis; '#' is a syntax error
# that makes lxml raise XPathEvalError before any rows are read.
for rows in tree.xpath('//*[@id="colLeft"]//table//tr'):
    row1text = rows.xpath('./td[1]/text()')
    row2text = rows.xpath('./td[2]/text()')
    # xpath() returns a list per cell; keep the row only when both cells
    # exist, and unwrap element [0] so plain strings reach the CSV.
    if row1text and row2text:
        csvrows.append([row1text[0], row2text[0]])
print(csvrows)

print('Input the csv file')
csvfile = input("> ")
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(['Name','Value']) #substitute as appropriate.
    writer.writerows(csvrows)

Python BeautifulSoup Empty Rows in CSV File

I am working on a scraper to pull street names and zip codes from a site, and all of that is working great; it builds a CSV file just fine for me. But when I open the CSV file in Excel, there is a blank row, then a row with a street name and the zip code in the next column, just like I want. This pattern of a blank row followed by a data row continues all the way through the file, which gives me rows of street names and zip codes alternating with the word "None" when imported into the phpMyAdmin database. I want to get rid of the blank rows. Here is my code.
from bs4 import BeautifulSoup
import csv
import urllib2

# Fetch and parse the street-map index page (Python 2: urllib2).
url="http://www.conakat.com/states/ohio/cities/defiance/road_maps/"
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read())

f = csv.writer(open("Defiance Steets1.csv", "w"))
f.writerow(["Street", "Zipcode"]) # Write column headers as the first line

# Each street link is followed by an <i> sibling holding the zip code
# wrapped in parentheses; [1:-1] strips them off.
for link in soup.find_all('a'):
    sibling = link.find_next_sibling('i')
    if getattr(sibling, 'name', None):
        street = link.string
        zipcode = sibling.string[1:-1]
        f.writerow([street, zipcode])
This worked for me (I added lineterminator ="\n"):
# Python 2 / BeautifulSoup 3 version (camelCase findAll/findNextSibling).
# Adding lineterminator="\n" stops csv writing "\r\n" rows, which is what
# produced the blank lines when the file was opened in Excel.
from BeautifulSoup import BeautifulSoup
import csv
import urllib2
url="http://www.conakat.com/states/ohio/cities/defiance/road_maps/"
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
f = csv.writer(open("Defiance Steets1.csv", "w"), lineterminator ="\n")
f.writerow(["Street", "Zipcode"]) # Write column headers as the first line
#print soup.
links = soup.findAll('a')
for link in links:
#i = link.find_next_sibling('i')
i = link.findNextSibling('i')
if getattr(i, 'name', None):
# The <i> sibling holds the zip in parentheses; [1:-1] strips them.
a, i = link.string, i.string[1:-1]
print [a,i]
f.writerow([a, i])
This works for me — thanks!
If you create the writer and open the file on different lines,
pass lineterminator as a parameter in the writer() call.

Categories