Using Python's parse with criteria - python

First of all, I have to say that I have very little experience with any sort of coding, so even I don't completely know what I'm after here, but I'm trying my best!
I've been writing this code that takes the HTML of a certain website and then gives me a .CSV file of the named elements(?) (you can see these in the inspect panel of the website).
So my question is: how can I use criteria with my current code so that it only returns words containing, for example, the letter g?
I'm happy to elaborate!
Thank you already!
import urllib.request
from bs4 import BeautifulSoup
import csv
url = 'https://kouluruoka.fi/menu/kouvola_koulujenruokalista'
request = urllib.request.Request(url)
content = urllib.request.urlopen(request)
parse = BeautifulSoup(content, 'html.parser')
#These texts get words in <h2> and <span> named elements
text1 = parse.find_all('h2')
text2 = parse.find_all('span')
#This code uses the texts above to create the .CSV file
with open('index.csv', 'a') as csv_file:
writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
for col1,col2 in zip(text1, text2):
writer.writerow([col1.get_text().strip(), col2.get_text().strip()])

You can check whether an element's text contains some string/letter this way:
# Collect the <h2> and <span> elements from the BeautifulSoup object `parse`.
h2_elements = parse.find_all('h2')
span_elements = parse.find_all('span')
# This code uses the texts above to create the .CSV file.
# newline='' leaves line-ending handling to the csv module, so no blank
# rows appear between records on Windows.
with open('index.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for h2_element, span_element in zip(h2_elements, span_elements):
        h2_element_str = h2_element.get_text().strip()
        span_element_str = span_element.get_text().strip()
        # Write the pair only when the searched letter occurs in both texts.
        if 'a' in h2_element_str and 'a' in span_element_str:
            writer.writerow([h2_element_str, span_element_str])

Related

Getting a list output to write correctly in rows in a csv file

I am trying to write this output to a csv file, but it is simply not working. I have tried many writing-to-csv tutorials, but none of them work. If you could please direct me to a tutorial explaining why this isn't working, I would like to learn the issue and solve it.
import bs4
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import csv
myurl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'
uclient = ureq(myurl)
page_html = uclient.read()
uclient.close()
page_soup = soup(page_html, 'html.parser')
items = page_soup.find_all('div', {'class':'item-container'})
#filename = 'zeus.csv'
#f = open(filename, 'w')
#header = 'Item Details\n'
#f.write(header)
#contain = items[0]
#container = items[0]
for container in items:
details = container.a.img['title']
with open('zeus.csv', 'w') as f:
f.write(details + "\n")
#print(details)
You can run
# Open the output file once, before the loop, so every title is kept
# instead of each iteration truncating what the previous one wrote.
with open('zeus.csv', 'w') as f:
    for container in items:
        details = container.a.img['title']
        # One title per line, with no stray padding spaces around the newline.
        f.write(details + "\n")
The problem in the code was that with open('zeus.csv', 'w') as f: was inside the loop, so each iteration overwrote what the previous iterations had written.
You can try something like this to write a list to a .csv file:
import csv
# Open the file (`...` stands in for the output path); newline='' leaves
# line-ending handling to the csv module.
with open(..., 'w', newline='') as your_file:
    writer = csv.writer(your_file, quoting=csv.QUOTE_ALL)
    # Write your list values as a single row.
    writer.writerow(your_list)

Issues while writing special characters to csv file

I am writing the crawled output of a webpage to CSV files. However, a few special characters, such as the hyphen, are not being written correctly.
Original Text : Amazon Forecast - Now Generally Available
Result in csv : Amazon Forecast – Now Generally Available
I tried the below code
from bs4 import BeautifulSoup
from datetime import date
import requests
import csv
source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')
# csv_file = open('aitrendsresults.csv', 'w')
csv_file = open('aws_cloud_results.csv', 'w' , encoding = 'utf8' )
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title','img','src','summary'])
match = soup.find_all('div',class_='lb-row lb-snap')
for n in match:
imgsrc= n.div.img.get('src')
titlesrc= n.find('div',{'class':'lb-col lb-mid-18 lb-tiny-24'})
titletxt= titlesrc.h2.text
anchortxt= titlesrc.a.get('href')
sumtxt= titlesrc.section.p.text
print(sumtxt)
csv_writer.writerow([titletxt,imgsrc,anchortxt,sumtxt])
csv_file.close()
Can you please help me get the text to look the same as the original text provided above?
Create a function to handle ASCII characters (i.e. Hyphen, Semicolon) and pass the string as argument inside the function below:
def decode_ascii(string: str) -> str:
    """Return *string* with every non-ASCII character removed.

    Characters that cannot be encoded as ASCII are dropped (the ``'ignore'``
    error handler), not replaced, so e.g. an en dash simply disappears from
    the result while plain ASCII text is returned unchanged.
    """
    return string.encode('ascii', 'ignore').decode('ascii')
input_text = 'Amazon Forecast - Now Generally Available'
output_text = decode_ascii(input_text)
print(output_text)
Output should be Amazon Forecast - Now Generally Available in the CSV.
I've been working with BS as well and I think you've only made a minor mistake. In line 8, where you open the csv file, the encoding should be "UTF-8" instead of "utf8".
See if that helps.
Using the title as a test, the following works for me:
from bs4 import BeautifulSoup
import requests, csv
source = requests.get('https://aws.amazon.com/blogs/aws/').text
soup = BeautifulSoup(source, 'lxml')
# utf-8-sig prepends a UTF-8 BOM to the file; newline='' leaves line-ending
# handling to the csv module.
with open("aws_cloud_results.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    # Header row.
    w.writerow(['title'])
    match = soup.find_all('div', class_='lb-row lb-snap')
    for n in match:
        # Each matched row holds its title in an <h2> inside this column div.
        titlesrc = n.find('div', {'class': 'lb-col lb-mid-18 lb-tiny-24'})
        titletxt = titlesrc.h2.text
        w.writerow([titletxt])

Need help writing to a CSV file Python 3.5

My code writes to a CSV file titled 'output'; here is a link to past help on this code.
When I run my code, the body row of my CSV file is overwritten each time. I want to write a new row every time new information is scraped from the table at the stock table URL.
Here is what my CSV file looks like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-3.00,45.18%,5.19M,30.47%,15.78M,-,-,0.00%,2.84M,-16.48%,-14.00M,-,-,1.00%,9.24%,88.82%,18.30M,0.86,-122.00%,136.99%,0.26,88.82%,27.27,0.11,-,-,4.00,-51.44%,0.87,3.51,15.00%,-,1.30 - 8.00,-27.10%,-,-,-15.40%,0.40%,-62.00%,2.73,-,1.10,-16.40%,25.10%,133.85%,0.52,450,1.20,-58.50%,-,53.21,19.81% 17.08%,No,0.37,-,-,5.40,2.96,Yes,0.13,-,-,991.40K,3.04,3.00,1.72%,-6.24%,29.44%,"5,358,503",2.70%
Here is my code:
import csv
import urllib.request
from bs4 import BeautifulSoup
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)
tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]
for url in url_list:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
#scrape single page and add data to list
#write datalist
with open('output.csv', 'wt') as file:
writer = csv.writer(file)
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
Append mode
The issue is with your command open('output.csv', 'wt') - 'w' option opens the file for (over)writing. If you want to append data at the end of the existing file, use the 'a' option instead, as shown in the fine manual at https://docs.python.org/3.7/library/functions.html#open .
Also, you might want to check if the file exists beforehand and write the header row only if it does not.

CSV | Text stored as elements of a list

I am creating a csv file which gathers several articles scraped from a website. The articles are obtained by scraping the text from URLs contained in another file.
I would like to make the CSV file as a list in which each article corresponds to an element of the list.
The code that I used now is this:
import csv
import requests
from bf4 import BeautifulSoup
with open('Training_news.csv', newline='') as file:
reader= csv.reader (file, delimiter=' ')
for row in reader:
for url in row:
r=requests.get(url)
r.encoding = "ISO-8859-1"
soup = BeautifulSoup(r.content, 'lxml')
text = soup.find_all(("p",{"class": "story-body-text story-content"}))
with open('Training_News_5.csv', 'w', newline='') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=' ')
spamwriter.writerow(text)
However, the CSV file created gives me this:
<p>Advertisement</p>, <p class="byline-dateline"><span class="byline" itemprop.......
<p class="feedback-message">We’re interested in your feedback on this page. <strong>Tell us what you think.</strong></p>, <p class="user-action">Go to Home Page »</p>
The articles stored are only three out of 50 and they do not allow me to select each article individually.

Python BeautifulSoup Empty Rows in CSV File

I am working on a scraper to pull street names and zip codes from a site, and all of that is working great and it builds a CSV file just fine for me. But when I open the CSV file in Excel, the file has a blank row, then a row with a street name with the zip code in the next column, just like I want. Next comes another blank row, then a row with a street name and the zip code beside it. This continues all the way through the file, so when it is imported into the PHPMyAdmin database I get a row with a street name and zip code, then the word none in the next row. I want to get rid of the blank rows. Here is my code.
from bs4 import BeautifulSoup
import csv
import urllib2
url="http://www.conakat.com/states/ohio/cities/defiance/road_maps/"
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
f = csv.writer(open("Defiance Steets1.csv", "w"))
f.writerow(["Street", "Zipcode"]) # Write column headers as the first line
links = soup.find_all('a')
for link in links:
i = link.find_next_sibling('i')
if getattr(i, 'name', None):
a, i = link.string, i.string[1:-1]
f.writerow([a, i])
This worked for me (I added lineterminator ="\n"):
from BeautifulSoup import BeautifulSoup
import csv
import urllib2
url="http://www.conakat.com/states/ohio/cities/defiance/road_maps/"
page=urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
f = csv.writer(open("Defiance Steets1.csv", "w"), lineterminator ="\n")
f.writerow(["Street", "Zipcode"]) # Write column headers as the first line
#print soup.
links = soup.findAll('a')
for link in links:
#i = link.find_next_sibling('i')
i = link.findNextSibling('i')
if getattr(i, 'name', None):
a, i = link.string, i.string[1:-1]
print [a,i]
f.writerow([a, i])
This works for me... thanks.
If you have the writer and the open call on different lines,
pass lineterminator as a parameter to the csv.writer call...

Categories