How can I export my scraped data into a CSV file. My code below prints out all of the data correctly, but I would like to export it the same way to a CSV file, line by line.
How can I write the data to a csv file?
import requests
import json
import csv
with open('Links.csv', 'r') as csvfile:
readCSV = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in readCSV:
data = row[0]
for b in row:
r = requests.get(b)
json_object = json.loads('{"data":%s}}' % (r.content.decode("utf-8").replace("jQuery111002521088376353553_1491736907010(", "")[:-2].replace("\'", "")))
for game in json_object["data"]["docs"]:
print ("Name: %s, Price: %s, CatalogId: %s, slug: %s" % (game["name"], game["minPrice"], game["catalogId"], game["slug"]))
You can write the data to a row in a csv file like:
writeCSV.writerow([game["name"], game["minPrice"], game["catalogId"], game["slug"]])
Here is this added to your code, with the init code needed:
import csv
import json
import requests
with open('Links.csv', 'r') as r_csvfile, open('outp.csv', 'w') as w_csvfile:
readCSV = csv.reader(r_csvfile, delimiter=',', quotechar='"')
writeCSV = csv.writer(w_csvfile, delimiter=',', quotechar='"')
writeCSV.writerow("Name Price CatalogId slug".split())
for row in readCSV:
data = row[0]
for b in row:
r = requests.get(b)
json_object = json.loads('{"data":%s}}' % (
r.content.decode("utf-8").replace(
"jQuery111002521088376353553_1491736907010(", "")[:-2]
.replace("\'", "")))
for game in json_object["data"]["docs"]:
writeCSV.writerow([game["name"], game["minPrice"],
game["catalogId"], game["slug"]])
You did not give any actual data, so I could not test this, but it will be close.
I think pandas is the package you‘re looking for.
Use pandas.dataframe.from_dict or pandas.dataframe.from_json. Once you have your pandas dataframe writing a csv file is as easy as it gets.
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
Related
im trying to get the value of the first and the thirs row in a csv file.
my approach gives me the first and the 3rd character of the first row. instead of the fields in row 1 and 3. Would be great if someone could give me a tipp what im doing wrong!
lang_tags = []
tweets = []
#open and read csv file
with open("tweet-corpus.csv", "r") as csv_file:
reader = csv.DictReader(csv_file)
for row in csv_file:
lang_tags = row[0]
tweets = row[2]
for lan in lang_tags:
print("lang: ", lang_tags)
print("tweet: ", tweets)
Use the csv reader object.
Ex:
with open("tweet-corpus.csv", "r") as csv_file:
reader = csv.reader(csv_file)
for row in reader:
lang_tags = row[0]
or
with open("tweet-corpus.csv", "r") as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
lang_tags = row['YOURCOL_NAME']
tweets = row['YOURCOL_NAME']
If your data looks anything remotely like:
col_name0, col_name1, col_name2, ...
value0, value1, value2, ...
value0, value1, value2, ...
I recommend using pandas.
import pandas as pd # by convention, we always import pandas as pd
df = pd.read_csv(filename)
column = df[column_name]
I am able to change the data to lowercase and remove all the punctuation but I have trouble saving the corrected data in CSV file.
import csv
import re
import os
input_file=raw_input("Name of the CSV file:")
output_file=raw_input("Output Name:")
reg_test=input_file
result = ''
with open(input_file,'r') as csvfile:
with open(output_file,'w') as csv_out_file:
filereader = csv.reader(csvfile)
filewriter =csv.writer(csv_out_file)
for row in filereader:
row = re.sub('[^A-Za-z0-9]+', '', str(row))
result += row + ','
lower = (result).lower()
csvfile.close()
csv_out_file.close()
You do not have to close the files, this is done automatically after the context of the with statement is over and you have to actually write something after you create the csv.writer, e.g. with writerow:
import csv
import re
input_file = 'in.csv'
output_file = 'out.csv'
with open(input_file, 'r') as csvfile, open(output_file, 'w') as csv_out_file:
filereader = csv.reader(csvfile)
filewriter = csv.writer(csv_out_file)
for row in filereader:
new_row = re.sub('[^A-Za-z0-9]+', '', str(row)) # manipulate the row
filewriter.writerow([new_row.lower()]) # write the new row to the out file
# the files are closed automatically after the context of the with statement is over
This saves the manipulated content of the first csv file to the second.
http://example.com/item/all-atv-quad.html,David,"Punjab",+123456789123
http://example.com/item/70cc-2014.html,Qubee,"Capital",+987654321987
http://example.com/item/quad-bike-zenith.html,Zenith,"UP",+123456789123
I have this test.csv where I have scraped a few items from certain site but the thing is "number" field has redundancy. So I somehow need to remove a row that has the same number as before. This is just the example file, In the real file some numbers are repeated more than 50+ times.
import csv
with open('test.csv', newline='') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',')
for column in csvreader:
"Some logic here"
if (column[3] == "+123456789123"):
print (column[0])
"or here"
I need reformated csv like this:
http://example.com/item/all-atv-quad.html,David,"Punjab",+123456789123
http://example.com/item/70cc-2014.html,Qubee,"Capital",+987654321987
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
def direct():
seen = set()
with open("test.csv") as infile, open("formatted.csv", 'w') as outfile:
for line in infile:
parts = line.rstrip().split(',')
number = parts[-1]
if number not in seen:
seen.add(number)
outfile.write(line)
def using_pandas():
"""Alternatively, use Pandas"""
df = pd.read_csv("test.csv", header=None)
df = df.drop_duplicates(subset=[3])
df.to_csv("formatted_pandas.csv", index=None, header=None)
def main():
direct()
using_pandas()
if __name__ == "__main__":
main()
This would filter out duplicates:
seen = set()
for line in csvreader:
if line[3] in seen:
continue
seen.add(line[3])
# write line to output file
And the csv read and write logic:
with open('test.csv') as fobj_in, open('test_clean.csv', 'w') as fobj_out:
csv_reader = csv.reader(fobj_in, delimiter=',')
csv_writer = csv.writer(fobj_out, delimiter=',')
seen = set()
for line in csvreader:
if line[3] in seen:
continue
seen.add(line[3])
csv_writer.writerow(line)
I am trying to connect to oracle table and execute a sql. I need to export result set to a csv file. My code is below:
import pyodbc
import csv
cnxn = pyodbc.connect("DSN=11g;UID=test101;PWD=passwd")
cursor = cnxn.cursor()
cursor.execute(sql)
row = cursor.fetchall()
with open('data.csv', 'w', newline='') as fp:
a = csv.writer(fp, delimiter=',')
for line in row:
a.writerows(line)
cursor.close()
when I do print to line within for loop, I get something like this:
('Production', 'farm1', 'dc1prb01', 'web')
('Production', 'farv2', 'dc2pr2db01', 'app.3')
('Production', 'farm5', 'dc2pr2db02', 'db.3')
this is not working. Any ideas what I might be missing?
It would be writerow for a single row:
a.writerow(line)
writerows expects an iterable of iterables, so it will iterate over the substrings writing each char individually.
If you want to use writerows call it on row:
row = cursor.fetchall()
with open('data.csv', 'w', newline='') as fp:
a = csv.writer(fp, delimiter=',')
a.writerows(row)
If you are using python2 remove newline='', newline is a *python*3 keyword:
row = cursor.fetchall()
with open('data.csv', 'w') as fp:
a = csv.writer(fp, delimiter=',')
a.writerows(row)
Some example data:
title1|title2|title3|title4|merge
test|data|here|and
test|data|343|AND
",3|data|343|and
My attempt at coding this:
import csv
import StringIO
storedoutput = StringIO.StringIO()
fields = ('title1', 'title2', 'title3', 'title4', 'merge')
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, fields, delimiter='|')
for counter, row in enumerate(reader):
counter += 1
#print row
if counter != 1:
for field in fields:
if field == "merge":
row['merge'] = ("%s%s%s" % (row["title1"], row["title3"], row["title4"]))
print row
storedoutput.writelines(','.join(map(str, row)) + '\n')
contents = storedoutput.getvalue()
storedoutput.close()
print "".join(contents)
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
output_csv = []
output_csv.append(contents.strip())
if "".join(output_csv) != input_csv:
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(output_csv))
Output should be
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
For your reference upon running this code the first print it prints the rows as I would hope then to appear in the output csv. However the second print prints the title row x times where x is the number of rows.
Any input or corrections or working code would be appreciated.
I think we can make this a lot simpler. Dealing with the rogue " was a bit of a nuisance, I admit, because you have to work hard to tell Python you don't want to worry about it.
import csv
with open('file.csv', 'rb') as input_csv, open("new_file.csv", "wb") as output_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(output_csv, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
produces
$ cat new_file.csv
title1|title2|title3|title4|merge
test|data|here|and|testhereand
test|data|343|AND|test343AND
",3|data|343|and|",3343and
Note that even though you wanted the original file updated, I refused. Why? It's a bad idea, because then you can destroy your data while working on it.
How can I be so sure? Because that's exactly what I did when I first ran your code, and I know better. ;^)
That double quote in the last line is definitely messing up the csv.DictReader().
This works:
new_lines = []
with open('file.csv', 'rb') as f:
# skip the first line
new_lines.append(f.next().strip())
for line in f:
# the newline and split the fields
line = line.strip().split('|')
# exctract the field data you want
title1, title3, title4 = line[0], line[2], line[3]
# turn the field data into a string and append in to the rest
line.append(''.join([title1, title3, title4]))
# save the new line for later
new_lines.append('|'.join(line))
with open('file.csv', 'w') as f:
# make one long string and write it to the new file
f.write('\n'.join(new_lines))
import csv
import StringIO
stored_output = StringIO.StringIO()
with open('file.csv', 'rb') as input_csv:
reader = csv.DictReader(input_csv, delimiter='|', quoting=csv.QUOTE_NONE)
writer = csv.DictWriter(stored_output, reader.fieldnames, delimiter="|",quoting=csv.QUOTE_NONE, quotechar=None)
merge_cols = "title1", "title3", "title4"
writer.writeheader()
for row in reader:
row["merge"] = ''.join(row[col] for col in merge_cols)
writer.writerow(row)
contents = stored_output.getvalue()
stored_output.close()
print contents
with open('file.csv', 'rb') as input_csv:
input_csv = input_csv.read().strip()
if input_csv != contents.strip():
with open('file.csv', 'wb') as new_csv:
new_csv.write("".join(contents))