Python Multithreading CSV Read Write Sort - python

I have a use case to read from a csv file (inputs.csv) and call an API endpoint for each row of the csv. And write the output into a different csv (outputs.csv) and sort the output csv by a particular column. I am able to achieve all that by the below code. Need to find out if I can do it more efficiently in a multi-threaded way.
def main():
start = time.time()
print "read from csv file "
input_file = 'inputs.csv'
output_file = 'outputs.csv'
read_write_csv(input_file, output_file)
print("after output-->",time.time() - start)
sort_csv(output_file)
print("after sort -->",time.time() - start)
def read_write_csv(input_file, output_file):
with open(input_file, 'r') as csv_file:
reader = csv.reader(csv_file)
# Reading row by row
count = 0
for row in reader:
# Opening csv result file in append mode
with open(output_file, "a+") as csv_save:
writer = csv.writer(csv_save)
print "mac address ", row[0]
writer.writerow([row[0], callExternalAPI(row[0])]),(row,))
print "{0} devices processed so far".format(count+1)
csv_save.close()
def sort_csv(output_file):
with open('sorted.csv', 'w') as csv_final:
r = csv.reader(open(output_file), delimiter=",")
writer_final = csv.writer(csv_final)
sortedResponse = sorted(r, key=operator.itemgetter(1), reverse=True)
for row in sortedResponse:
writer_final.writerow(row)
main()
As you can see, I am a python newbie here so any suggestion to improve the code is most welcome.

Related

How to remove specifc row from csv file using python

I am working on one program and trying to achieve following functionalities.
add new student
Remove student based on id
here is my code
from csv import writer
import csv
def add(file_name, list_of_elem):
# Open file in append mode
with open(file_name, 'a+', newline='') as write_obj:
# Create a writer object from csv module
csv_writer = writer(write_obj)
# Add contents of list as last row in the csv file
csv_writer.writerow(list_of_elem)
def remove():
id = input("Enter ID : ")
with open('students.csv', 'rb') as inp, open('students.csv', 'wb') as out:
writer = csv.writer(out)
for row in csv.reader(inp):
if row[0] != id:
writer.writerow(row)
# List of strings
row_contents = [11,'mayur','Java','Tokyo','Morning']
# Append a list as new line to an old csv file
add('students.csv', row_contents)
remove()
add function works properly but when i tried remove function it removes all existing entries.Could anyone please help me.
First I will show the code and below I will left some comments about the changes.
from csv import writer
import csv
def add(file_name, list_of_elem):
# Open file in append mode
with open(file_name, 'a+', newline = '') as write_obj:
# Create a writer object from csv module
csv_writer = writer(write_obj)
# Add contents of list as last row in the csv file
csv_writer.writerow(list_of_elem)
def remove():
idt = input("Enter ID : ")
with open('students.csv', 'r') as inp:
newrows = []
data = csv.reader(inp)
for row in data:
if row[0] != idt:
newrows.append(row)
with open('students.csv', 'w') as out:
csv_writer = writer(out)
for row in newrows:
csv_writer.writerow(row)
def display():
with open('students.csv','r') as f:
data = csv.reader(f)
for row in data:
print(row)
# List of strings
row_contents = [10,'mayur','Java','Tokyo','Morning']
add('students.csv', row_contents)
row_contents = [11,'mayur','Java','Tokyo','Morning']
add('students.csv', row_contents)
row_contents = [12,'mayur','Java','Tokyo','Morning']
add('students.csv', row_contents)
# Append a list as new line to an old csv file
display()
remove()
If your file is a CSV, you should use a text file, instead of a binary one.
I changed the name of the variable id to ìdt because id is built-in to return the identity of an object and it's not a good practice overwrite built-in functions.
To remove only rows with an specific idt you should read all the file, store into a var (list), remove what you want to delete and only after that save the result.
You should use a temporary file instead of opening and writing to the same file simultaneously. Checkout this answer: https://stackoverflow.com/a/17646958/14039323

How to extract one column to another file from a 300GB file

Problem was the huge data number, and I have to do it with my personal laptop with 12GB RAM. I tried a loop with 1M. lines every round, and used csv.writer. But csv.writer wrote like 1M. lines every two hours. So, any other ways worth to try?
lines = 10000000
for i in range(0, 330):
list_str = []
with open(file, 'r') as f:
line_flag = 0
for _ in range(i*lines):
next(f)
for line in f:
line_flag = line_flag + 1
data = json.loads(line)['name']
if data != former_str:
list_str.append(data)
former_str = data
if line_flag == lines:
break
with open(self.path + 'data_range\\names.csv', 'a', newline='') as writeFile:
writer = csv.writer(writeFile, delimiter='\n')
writer.writerow(list_str)
writeFile.close()
another version
def read_large_file(f):
block_size = 200000000
block = []
for line in f:
block.append(line[:-1])
if len(block) == block_size:
yield block
block = []
if block:
yield block
def split_files():
with open(write_file, 'r') as f:
i = 0
for block in read_large_file(f):
print(i)
file_name = write_name + str(i) + '.csv'
with open(file_name, 'w', newline='') as f_:
writer = csv.writer(f_, delimiter='\n')
writer.writerow(block)
i += 1
This was after it read a block and writing ... I wonder how come the rate of data trasmission was keeping about 0.
It should be as simple as this:
import json
import csv
with open(read_file, 'rt') as r, open(write_file, 'wt', newline='') as w:
writer = csv.writer(w)
for line in r:
writer.writerow([json.loads(line)['name']])
I tried the loop inside the file, but I always get me a Error, I guessed we cannot write the data into another file while opening the file?
You totally can write data in one file while reading another. I can't tell you more about your error until you post what it said, though.
There was a bit in your code about former_str which is not covered under "extract one column", so I did not write anything about it.
Would something like this work?
Essentially using a generator to avoid reading the entire file in memory, and writing the data one line at a time.
import jsonlines # pip install jsonlines
from typing import Generator
def gen_lines(file_path: str, col_name: str) -> Generator[str]:
with jsonline.open(file_path) as f:
for obj in f:
yield obj[col_name]
# Here you can also change to writing a jsonline again
with open(output_file, "w") as out:
for item in gen_lines(your_file_path, col_name_to_extract):
out.write(f"{item}\n")

How to read a CSV and adapt + write every row to another CSV?

I tried this but it just writes "lagerungskissen kleinkind,44" several times instead of transferring every row.
keyword = []
rank = []
rank = list(map(int, rank))
data = []
with open("keywords.csv", "r") as file:
for line in file:
data = line.strip().replace('"', '').split(",")
keyword = data[0]
rank = data[3]
import csv
with open("mynew.csv", "w", newline="") as f:
thewriter = csv.writer(f)
thewriter.writerow(["Keyword", "Rank"])
for row in keyword:
thewriter.writerow([keyword, rank])
It should look like this
This is writing the same line in your output CSV because the final block is
for row in keyword:
thewriter.writerow([keyword, rank])
Note that the keyword variable doesn't change in the loop, but the row does. You're writing that same [keyword, rank] line len(keyword) times.
I would use the csv package to do the reading and the writing for this. Something like
import csv
input_file = '../keywords.csv'
output_file = '../mynew.csv'
# open the files
fIn = open(input_file, 'r', newline='')
fOut = open(output_file, 'w')
csvIn = csv.reader(fIn, quotechar='"') # check the keyword args in the docs!
csvOut = csv.writer(fOut)
# write a header, then write each row one at a time
csvOut.writerow(['Keyword', 'Rank'])
for row in csvIn:
keyword = row[0]
rank = row[3]
csvOut.writerow([keyword, rank])
# and close the files
fOut.close()
fIn.close()
As as side note, you could write the above using the with context manager (e.g. with open(...) as file:). The answer here shows how to do it with multiple files (in this case fIn and fOut).

Repeat first line on whole text file using python

I am making a Twitter sentiment analysis. After cleaning the tweets, when I try to write tweets into .txt file from .csv it writes only first tweets in text file and repeats until the end. Consider the following code
f = open('PanamaCase.csv', 'r')
with f:
reader = csv.DictReader(f)
i=0
for row in reader:
row=str(row['Tweets'])
#print(type(row))
print(clean(row))
txt = open('cleanedTweets.txt','w')
#line = 0
with txt:
reader2 = csv.DictReader(f)
for line in reader2:
txt.write(clean(row) + "\n")
I think your problem is that you are reading the input file twice in your code (or actually 1 + once for each line).
I suggest to try:
f = open('PanamaCase.csv', 'r')
with f:
txt = open('cleanedTweets.txt','w')
with txt:
for row in reader:
row=str(row['Tweets'])
print(clean(row))
txt.write(clean(row) + "\n")

How to read list which contains comma from CSV file as a column?

I want to read CSV file which contains following data :
Input.csv-
10,[40000,1][50000,5][60000,14]
20,[40000,5][50000,2][60000,1][70000,1][80000,1][90000,1]
30,[60000,4]
40,[40000,5][50000,14]
I want to parse this CSV file and parse it row by row. But these lists contains commas ',' so I'm not getting correct result.
Program-Code-
if __name__ == "__main__":
with open(inputfile, "r") as f:
reader = csv.reader(f,skipinitialspace=True)
next(reader,None)
for read in reader:
no = read[0]
splitted_record = read[1]
print splitted_record
Output-
[40000
[40000
[60000
[40000
I can understand read.csv method reads till commas for each column. But how I can read whole lists as a one column?
Expected Output-
[40000,1][50000,5][60000,14]
[40000,5][50000,2][60000,1][70000,1][80000,1][90000,1]
[60000,4]
[40000,5][50000,14]
Writing stuff to other file-
name_list = ['no','splitted_record']
file_name = 'temp/'+ no +'.csv'
if not os.path.exists(file_name):
f = open(file_name, 'a')
writer = csv.DictWriter(f,delimiter=',',fieldnames=name_list)
writer.writeheader()
else:
f = open(file_name, 'a')
writer = csv.DictWriter(f,delimiter=',',fieldnames=name_list)
writer.writerow({'no':no,'splitted_record':splitted_record})
How I can write this splitted_record without quote ""?
you can join those items together, since you know it split by comma
if __name__ == "__main__":
with open(inputfile, "r") as f:
reader = csv.reader(f,skipinitialspace=True)
next(reader,None)
for read in reader:
no = read[0]
splitted_record = ','.join(read[1:])
print splitted_record
output
[40000,1][50000,5][60000,14]
[40000,5][50000,2][60000,1][70000,1][80000,1][90000,1]
[60000,4]
[40000,5][50000,14]
---update---
data is the above output
with open(filepath,'wb') as f:
w = csv.writer(f)
for line in data:
w.writerow([line])
You can use your own dialect and register it to read as you need.
https://docs.python.org/2/library/csv.html

Categories