Cannot export python script output into the same csv file - python

#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
import csv
class GetFeeds(object):
def main(self):
self.malc0de()
self.malwaredomainlist()
def malc0de(self):
url=requests.get('http://malc0de.com/rss/')
feed=url.content
soup=BeautifulSoup(feed,'html.parser')
with open("feeds_123.csv", "w") as f:
writer = csv.writer(f, delimiter=";")
for link in soup.find_all('item'):
desc = link.find('description').contents
formatted_desc = desc[0].split(",")
formatted_desc_contents = [cont.split(":")[1] for cont in formatted_desc]
print formatted_desc_contents
writer.writerow(formatted_desc_contents)
def malwaredomainlist(self):
url=requests.get('http://www.malwaredomainlist.com/hostslist/mdl.xml')
feed2=url.content
soup=BeautifulSoup(feed2,'html.parser')
##print soup.prettify()
with open("feeds_123.csv", "w") as b:
writer = csv.writer(b, delimiter=";")
for link in soup.find_all('item'):
desc = link.find('description').contents
formatted_desc = desc[0].split(",")
formatted_desc_contents = [cont.split(":")[1] for cont in formatted_desc]
print formatted_desc_contents
writer.writerow(formatted_desc_contents)
if __name__ == "__main__":
o = GetFeeds()
o.main()
Currently, I am trying to export the information from both mac0de and malwaredomainlist to the same file called feeds_123.csv however, the csv file only shows malwaredomainlist items instead of showing both. I tried extracting it into 2 different files it works. May I know how can I solve this error and extract into the same file?

You need to open the file in "a" mode instead of "w" ,"a" means append and will add new content to the previous. "w" will clean the file and write over whatever was there.

Related

How can I edit my code to print out the content of my created json file?

My program takes a csv file as input and writes it as an output file in json format. On the final line, I use the print command to output the contents of the json format file to the screen. However, it does not print out the json file contents and I don't understand why.
Here is my code that I have so far:
import csv
import json
def jsonformat(infile,outfile):
contents = {}
csvfile = open(infile, 'r')
reader = csvfile.read()
for m in reader:
key = m['No']
contents[key] = m
jsonfile = open(outfile, 'w')
jsonfile.write(json.dumps(contents))
csvfile.close()
jsonfile.close()
return jsonfile
infile = 'orders.csv'
outfile = 'orders.json'
output = jsonformat(infile,outfile)
print(output)
Your function returns the jsonfile variable, which is a file.
Try adding this:
jsonfile.close()
with open(outfile, 'r') as file:
return file.read()
Your function returns a file handle to the file jsonfile that you then print. Instead, return the contents that you wrote to that file. Since you opened the file in w mode, any previous contents are removed before writing the new contents, so the contents of your file are going to be whatever you just wrote to it.
In your function, do:
def jsonformat(infile,outfile):
...
# Instead of this:
# jsonfile.write(json.dumps(contents))
# do this:
json_contents = json.dumps(contents, indent=4) # indent=4 to pretty-print
jsonfile.write(json_contents)
...
return json_contents
Aside from that, you aren't reading the CSV file the correct way. If your file has a header, you can use csv.DictReader to read each row as a dictionary. Then, you'll be able to use for m in reader: key = m['No']. Change reader = csvfile.read() to reader = csv.DictReader(csvfile)
As of now, reader is a string that contains all the contents of your file. for m in reader makes m each character in this string, and you cannot access the "No" key on a character.
a_file = open("sample.json", "r")
a_json = json.load(a_file)
pretty_json = json.dumps(a_json, indent=4)
a_file.close()
print(pretty_json)
Using this sample to print the contents of your json file. Have a good day.

Python write a list into csv

I am very new to python.
I have a list of stock names in a csv. I extract the names and put it before a website domain to create urls. I am trying to write the urls I created into another csv, but it only writes the last one out of the list. I want it to write all of the url into the csv.
with open('names.csv', 'r') as datafile:
for line in datafile:
domain = f'https://ceo.ca/{line}'
urls_link = (domain.strip())
print(urls_link)
y = open("url.csv","w")
y.writelines(urls_link)
y.close()
names.csv: https://i.stack.imgur.com/WrrLw.png
url.csv: https://i.stack.imgur.com/BYEgN.png
I would want the url csv look like this:
https://i.stack.imgur.com/y4xre.png
I apologise if I worded some things horribly.
You can use csv module in python
Try using this code:
from csv import writer,reader
in_FILE = "names.csv"
out_FILE = 'url.csv'
urls = list()
with open(in_FILE, 'r') as infile:
read = reader(infile, delimiter=",")
for domain_row in read:
for domain in domain_row:
url = f'https://ceo.ca/{domain.strip()}'
urls.append(url)
with open(out_FILE, 'w') as outfile:
write = writer(outfile)
for url in urls:
write.writerow([url])

I have converted a pdf file to csv using anaconda python3 But the converted csv file is not in a readable form how to make it readable?

# importing required modules
import PyPDF2
# creating a pdf file object
pdfFileObj = open(path, 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
# printing number of pages in pdf file
print(pdfReader.numPages)
# creating a page object
pageObj = pdfReader.getPage(0)
# extracting text from page
print(pageObj.extractText())
df = pd.DataFrame(pdfFileObj)
print (df)
df.to_csv('output.csv')
I have converted a pdf file to csv using anaconda python 3. But the converted csv file is not in a readable form. how to make that csv in readable format?
I tested your method and I couldn't find a way to correct the csv ouput. I useally do it this way:
import csv
import os
from miner_text_generator import extract_text_by_page
def export_as_csv(pdf_path, csv_path):
filename = os.path.splitext(os.path.basename(pdf_path))[0]
counter = 1
with open(csv_path, 'w') as csv_file:
writer = csv.writer(csv_file)
for page in extract_text_by_page(pdf_path):
text = page[0:100]
words = text.split()
writer.writerow(words)
if __name__ == '__main__':
pdf_path = '<your path to the file>.pdf'
csv_path = '<path to the output>.csv'
export_as_csv(pdf_path, csv_path)

Writing Printed Output to CSV - Numpy

I want this output written via CSV
['https://www.lendingclub.com/loans/personal-loans' '6.16% to 35.89%']
['https://www.lendingclub.com/loans/personal-loans' '1% to 6%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.discover.com/personal-loans/' '6.99% to 24.99%']
However when I run the code to write the output to CSV I only get the last line written to the CSV file:
['https://www.discover.com/personal-loans/' '6.99% to 24.99%']
Could it be because my printed output is not comma separated? I attempted to circumvent having to put a comma in there by using a space as the delimiter. Let me know your thoughts. Would love some help on this because I am having the hardest time reshaping this collected data.
plcompetitors = ['https://www.lendingclub.com/loans/personal-loans',
'https://www.marcus.com/us/en/personal-loans',
'https://www.discover.com/personal-loans/']
#cycle through links in array until it finds APR rates/fixed or variable using regex
for link in plcompetitors:
cdate = datetime.date.today()
l = r.get(link)
l.encoding = 'utf-8'
data = l.text
soup = bs(data, 'html.parser')
#captures Discover's rate perfectly but catches too much for lightstream/prosper
paragraph = soup.find_all(text=re.compile('[0-9]%'))
for n in paragraph:
matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
try:
irate = str(matches[0])
array = np.asarray(irate)
array2 = np.append(link,irate)
array2 = np.asarray(array2)
print(array2)
#with open('test.csv', "w") as csv_file:
# writer = csv.writer(csv_file, delimiter=' ')
# for line in test:
# writer.writerow(line)
except IndexError:
pass
When it comes to using csv file, pandas comes handy.
import datetime
import requests as r
from bs4 import BeautifulSoup as bs
import numpy as np
import regex as re
import pandas as pd
plcompetitors = ['https://www.lendingclub.com/loans/personal-loans',
'https://www.marcus.com/us/en/personal-loans',
'https://www.discover.com/personal-loans/']
df = pd.DataFrame({'Link':[],'APR Rate':[]})
#cycle through links in array until it finds APR rates/fixed or variable using regex
for link in plcompetitors:
cdate = datetime.date.today()
l = r.get(link)
l.encoding = 'utf-8'
data = l.text
soup = bs(data, 'html.parser')
#captures Discover's rate perfectly but catches too much for lightstream/prosper
paragraph = soup.find_all(text=re.compile('[0-9]%'))
for n in paragraph:
matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
irate = ''
try:
irate = str(matches[0])
df2 = pd.DataFrame({'Link':[link],'APR Rate':[irate]})
df = pd.concat([df,df2],join="inner")
except IndexError:
pass
df.to_csv('CSV_File.csv',index=False)
I have stored each link and it's irate value in a data frame df2 and I concatenate it to parent data frame df.
At the end, I write parent data frame df to a csv file.
I think the problem is that you are opening the file in write-mode (the "w" in open('test.csv', "w")), meaning that Python overwrites what's already written in the file. I think you're looking for append-mode:
# open the file before the loop, and close it after
csv_file = open("test.csv", 'a') # change the 'w' to an 'a'
csv_file.truncate(0) # clear the contents of the file
writer = csv.writer(csv_file, delimiter=' ') # make the writer beforehand for efficiency
for n in paragraph:
matches = re.findall('(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
try:
irate = str(matches[0])
array = np.asarray(irate)
array2 = np.append(link,irate)
array2 = np.asarray(array2)
print(array2)
for line in test:
writer.writerow(line)
except IndexError:
pass
# close the file
csv_file.close()
If this doesn't work, please let me know!

how to add lists to a dictionary then output to .csv

I'm try to iterate through tables in html by a searchlabel, then update the found value to a dictionary, then write those values to a csv. The output currently works for both the url and the headline, but the name output will either be blank or show "None." If i print the output of blog["name'] however, it is correctly pulling the information I want. I suspect that it's an indentation error but I can't figure out where to line things up. I've tried moving things around but nothing seems to work to get the name assignment to work inside that loop.
import os
from bs4 import BeautifulSoup
import my_csv_writer
def td_finder(tr, searchLabel):
value = ""
index = tr.text.find(searchLabel)
if index>-1:
tds = tr.findAll('td')
if len(tds)>1:
value = tds[1].text
return value
def main():
topdir = 'some_directory'
writer = my_csv_writer.CsvWriter("output.csv")
writer.writeLine(["url", "headline", "name"])
"""Main Function"""
blog = []
for root, dirs, files in os.walk(topdir):
for f in files:
url = os.path.join(root, f)
url = os.path.dirname(url).split('some_file')[1]
if f.lower().endswith((".html")):
file_new = open(os.path.join(root, f), "r").read()
soup = BeautifulSoup(file_new)
blog = {}
#Blog Title
blog["title"] = soup.find('title').text
for table in soup.findAll("table"):
for tr in table.findAll("tr"):
#name
blog["name"] = td_finder(tr, "name:")
seq = [url, unicode(blog["title"]), unicode(blog.get("name"))]
writer.writeLine(seq)
#return ""
if __name__ == '__main__':
main()
print "Finished main"
You're writing unicode strings to a csv file which according to the official docs "The csv module doesn’t directly support reading and writing Unicode...".
It does offer alternative classes to enable different encodings via UnicodeWriter. The following answer from Boud on SO highlights the need to set the desired encoding in the CSV file.

Categories