Select an item in csv based on value of second column - python

I'm trying to copy files to another folder based on my csv. My csv contains the list of files in my folder. But how can I filter it based on the second column? For example, copy a file only if the second column contains "Undetected".
Here is my code, but I don't know how to filter the files. This copies all my files to another folder.
import os
import shutil
import csv
# Build the set of file names to copy from the CSV, then copy every matching
# file from dir_src to dir_dst.
valid_files = set()  # empty set
with open('sha1_vsdt.csv', 'r') as f:
    for rowDict in csv.reader(f, delimiter=','):
        # NOTE(review): this is the line the question asks about; it is left
        # as posted. For a non-empty file name the expression evaluates to
        # True/False, so the set never receives an actual file name.
        valid_files |= {rowDict[0] and "Undetected" in rowDict [2] }  # add file name to set
        print(rowDict)  # if desired
# Raw strings: an un-escaped '\U' is an invalid escape sequence in Python 3.
dir_src = r'C:\Users\Administrator\Desktop\OJT\scanner\samples_extracted'
dir_dst = r'C:\Users\Administrator\Desktop\OJT\scanner\transfer'
for file in os.listdir(dir_src):
    if file in valid_files:
        src_file = os.path.join(dir_src, file)
        dst_file = os.path.join(dir_dst, file)
        shutil.copy(src_file, dst_file)
How do I correct this line?
valid_files |= {rowDict[0] and "Undetected" in rowDict [2] } # add file name to set
Example entries of my csv
0191a23ee122bdb0c69008971e365ec530bf03f5,aaa,MIME 6010-0
02b809d4edee752d9286677ea30e8a76114aa324,bbb,Microsoft RTF 6008-0
0349e0101d8458b6d05860fbee2b4a6d7fa2038d,ccc,Adobe Portable Document Format(PDF) 6015-0
035a7afca8b72cf1c05f6062814836ee31091559,ddd,Adobe Portable Document Format(PDF) 6015-0
042065bec5a655f3daec1442addf5acb8f1aa824,eee,Undetected
04939e040d9e85f84d2e2eb28343d94a50ed46ac,fff,Undetected

An if should be all that is required:
for rowDict in csv.reader(f, delimiter=','):
    # Column 0 is the file name, column 2 the detection result.
    # The length check skips blank or truncated lines (csv.reader yields []
    # for an empty line), which would otherwise raise IndexError.
    if len(rowDict) > 2 and "Undetected" in rowDict[2]:
        valid_files.add(rowDict[0])

Related

How to use elements in list by order

My goal is to change multiple csv files in a folder into JSON.
First, I needed to list my csv files
# Print the full path of every .csv file found in the folder.
csv_dir = "C:/Users/folder_to_csv"
for file in os.listdir(csv_dir):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        # check if csv files are listed correctly
        print(os.path.join(csv_dir, filename))
With this, I was able to call csv files in that folder.
Result:
C:/Users/folder_to_csv\file_1.csv C:/Users/folder_to_csv\file_2.csv C:/Users/folder_to_csv\file_3.csv
Then, I wanted to use all of the csv files in 'csvlist' to build jsonObj; however, for some reason, my code is only using the first file (C:/Users/folder_to_csv\file_1.csv).
This is what I have tried so far:
import json
import csv
import requests
import threading
import os
for file in os.listdir("C:/Users/folder_to_csv"):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        # NOTE(review): left as posted -- this is the bug the question is
        # about. Each iteration overwrites csvlist with a single path string
        # instead of collecting the paths in a list.
        csvlist = os.path.join("C:/Users/folder_to_csv", filename)
data = {}
def main():
    """Read the CSV file named by csvlist and print its rows as JSON."""
    #loop csv list so my codes can read all csv files
    # NOTE(review): csvlist is a string here, so len() counts characters and
    # every pass of the loop opens the same file.
    length = len(csvlist)
    for i in range(length):
        i += 1
        path = csvlist
        #switch csv to json
        with open(path, mode='r') as f:
            reader = csv.DictReader(f)
            processdata = [row for row in reader]
        dlist = processdata
        jsonObj = json.dumps(dlist)
        # (a stray '})' that followed this statement was a syntax error and
        # has been removed)
        print(jsonObj)
main()
In the initial loop, you keep redefining the csvlist variable. I suppose you want it to be a list? Then just create an initial empty list and append to it instead of redefining
# Collect every matching path in a list instead of overwriting one variable.
csvlist = []
for file in os.listdir("C:/Users/folder_to_csv"):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        csvlist.append(os.path.join("C:/Users/folder_to_csv", filename))

Move files listed in csv file?

I have been trying to use the following code to move files that are listed in a csv file. But at most it will copy the last file in the list and not the rest.
I keep hitting this wall with every example I have seen. What am I doing wrong?
My CSV list will have entries like:
12355,12355.jpg
Here's my code
import os
import shutil
import csv
# Map the first CSV column to the file name in the second column.
keys={}
with open('shuttle_image.csv', 'r') as f:
    reader = csv.reader(f, delimiter = ',')
    for rowDict in reader:
        keys[rowDict[0]] = rowDict[1]
        print (rowDict)
dir_src = 'C:\\Users\\Willie\\Desktop\\Suppliers Dropship\\hunting\\'
dir_dst = 'C:\\image\\'
for file in os.listdir(dir_src):
    src_file = os.path.join(dir_src, file)
    dst_file = os.path.join(dir_dst, file)
    # NOTE(review): left as posted -- this is the bug the question is about.
    # rowDict still holds only the last CSV row once the reading loop has
    # finished, so at most the last listed file can match.
    if file in rowDict[1]:
        shutil.move(src_file, dst_file)
I think doing something like this will work (untested):
import os
import shutil
import csv
# Map the first CSV column to the file name in the second column.
keys = {}
with open('shuttle_image.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for rowDict in reader:
        if len(rowDict) < 2:
            continue  # blank or truncated line: no file name to record
        keys[rowDict[0]] = rowDict[1]
        print(rowDict)  # if desired
valid_files = set(keys.values())  # file names found in csv
dir_src = 'C:\\Users\\Willie\\Desktop\\Suppliers Dropship\\hunting\\'
dir_dst = 'C:\\image\\'
# Move only the files whose names were listed in the CSV.
for file in os.listdir(dir_src):
    if file in valid_files:
        src_file = os.path.join(dir_src, file)
        dst_file = os.path.join(dir_dst, file)
        shutil.move(src_file, dst_file)
As an optimization, unless you need the keys dictionary for other processing, you could change the first part so it just creates the valid_files set variable used in the second for loop:
valid_files = set()  # empty set
with open('shuttle_image.csv', 'r') as f:
    for rowDict in csv.reader(f, delimiter=','):
        if len(rowDict) < 2:
            continue  # blank or truncated line: no file name to record
        valid_files.add(rowDict[1])  # add file name to set
        print(rowDict)  # if desired
The reason why it's only the last file that could be copied (if it was) is because in this line:
if file in rowDict[1]:
you are referencing rowDict outside of the first for-loop. So at that execution moment, it contains the last value of this loop.
If I understand correctly what you are trying to do you could try something like this (untested code):
import os
import shutil
import csv
dir_src = 'C:\\Users\\Willie\\Desktop\\Suppliers Dropship\\hunting\\'
dir_dst = 'C:\\image\\'
# For each file named in the CSV, move it if it exists in the source folder.
with open('shuttle_image.csv', 'r') as f:
    reader = csv.reader(f, delimiter = ',')
    for rowDict in reader:
        if len(rowDict) < 2:
            continue  # blank or truncated line: no file name to use
        # Index instead of unpacking into `id, filename`: that shadowed the
        # builtin id() and raised ValueError on rows without exactly 2 fields.
        filename = rowDict[1]
        src_file = os.path.join(dir_src, filename)
        if os.path.exists(src_file):
            # Passing a directory as the destination keeps the file name.
            shutil.move(src_file, dir_dst)
So instead of:
Constructing a dictionary with all the values in your CSV file
Somehow check for every file in your source directory that it is included in your dictionary (which is what I interpreted you were trying to do)
And move it if it is.
You could:
For every file extracted from your CSV, check that it exists in your source directory.
If it does, you move it to the destination directory.
Is that what you were trying to do ?
[And if the filename stays the same, you only need to specify the destination directory for the second argument of shutil.move()]

grab headers from multiple tsv/csv files

I have a list of tsv files where I am looking to grab column headers for all the files.
# Read only the first (header) row of a single file.
# Mode 'r' replaces 'rU': the 'U' flag was removed in Python 3.11.
with open(os.path.abspath('reference/file.tsv'), 'r') as file:
    # NOTE(review): csv.reader splits on commas by default; a tab-separated
    # file presumably needs delimiter='\t' -- confirm against the data.
    reader = csv.reader(file)
    row1 = next(reader)
Currently, this snippet only reads 1 file where I have a list of files that needs to be parsed.
# Absolute path of the folder that holds the files to parse.
dir_path = os.path.abspath('reference/')
# Entry names inside that folder (names only, not full paths).
files = os.listdir(dir_path)
The name of the files are listed in files. How do I loop through the list of files and grab only the column headers for each file?
I tried this and it works.
import os
import csv
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
# Print the header row of every entry in the folder.
for f in files:
    # os.path.join instead of manual '/' concatenation; mode 'r' replaces
    # 'rU', which was removed in Python 3.11.
    with open(os.path.join(dir_path, f), 'r') as file:
        reader = csv.reader(file)
        row1 = next(reader)
        print(row1)
The files variable in your code is the content of the reference folder, meaning all files and subfolders of the folder. They are returned in a list of strings, containing only the file or subfolder name. This means that you'll have to prefix the path yourself.
Example:
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
for file in files:
    # os.listdir returns bare names, so build the full path first; testing
    # the bare name would look in the current working directory instead.
    full_path = os.path.join(dir_path, file)
    # Skip non-files
    if not os.path.isfile(full_path):
        continue
    # Mode 'r' replaces 'rU', which was removed in Python 3.11.
    with open(full_path, 'r') as f:
        reader = csv.reader(f)
        row1 = next(reader)
An alternative using the pathlib module:
from pathlib import Path

for file in Path('reference/').glob('*'):
    # Skip directories and other non-file entries.
    if not file.is_file():
        continue
    # Path.open replaces open(str(file.resolve()), 'rU'); the 'U' flag was
    # removed in Python 3.11.
    with file.open('r') as f:
        reader = csv.reader(f)
        row1 = next(reader)
Wouldn't you be better off in reading the first line of each of those files, appending them to a list and then passing them to csvreader?
Example:
from pathlib import Path

# Collect the first line of every file, then parse them all in one pass.
lines = []
for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'r') as f:
        lines.append(f.readline())
# csv.reader accepts any iterable of strings, not just a file object.
reader = csv.reader(lines)
for row in reader:
    # whatever you want to do with the parsed lines
    print(row)

Merge files into xlsx and then reconstruct the dir

I have many files ('*.pl-pl'). My script has to find each of these files and merge them into one xlsx file using openpyxl.
Now I want to rebuild those files so that they are identical to the originals.
But there is a problem after writing:
(content variable contains content of one file (read from one excel cell))
import io

# Let the file object do the UTF-8 encoding: writing content.encode('utf-8')
# (bytes) to a text-mode file raises TypeError on Python 3.
# NOTE(review): assumes content is a text (unicode) string, as read from the
# Excel cell -- confirm.
with io.open(path, 'w', encoding='utf-8') as f:
    f.write(content)
So now I check whether the original files are the same as the new files. The text in those files seems to be the same, but there are small differences in size. When I use the WinDiff application to compare them, it finds some tuples that are different, but it says that they differ in blanks only.
Could you give me advice on how to rebuild those files so that they are the same as before?
Or is this way correct?
Note: I am rebuilding them to be sure that they will have the same encoding etc., because the merged Excel file will be used for translation and the translated files then have to be rebuilt in place of the originals.
Here is the code - it walks the directory and prints all file names and contents into one temporary file. Then it creates an Excel file: the 1st column is the path (to be able to reconstruct the directory) and the 2nd column contains the content of the file, where newlines have been replaced by '*=*'.
def print_to_file():
    """Dump every '*.pl-pl' file under OriginalDir into 'files.mdoc'.

    One record is emitted per file, in the form '<path>::<content>', with the
    file's lines joined by '*=*' in place of newlines.
    """
    import os
    for root, dirs, files in os.walk("OriginalDir"):
        for file in files:
            if not file.endswith(".pl-pl"):
                continue
            abs_path = os.path.join(root, file)
            with open(abs_path) as f:
                # Only the newline is stripped, so other whitespace survives.
                text = [line.strip('\n') for line in f]
            # mLib is defined outside this snippet; presumably printToFile
            # appends one line to the named file -- confirm.
            mLib.printToFile('files.mdoc', abs_path + '::' + '*=*'.join(text)) #'*=*' represents '\n'
def write_it():
    """Copy the records of 'files.mdoc' into 'wrap_text1.xlsx'.

    Each input line has the form '<path>::<content>'. Row i of the first
    worksheet receives the path in column A and the content in column B.
    """
    from openpyxl import Workbook
    file = 'files.mdoc'
    workbook = Workbook()
    worksheet = workbook.worksheets[0]
    worksheet.title = "Translate"
    with open(file) as f:
        # Worksheet rows are 1-based.
        for i, line in enumerate(f, 1):
            # Split on the first '::' only, so content that itself contains
            # '::' is not truncated.
            name, text = line.strip('\n').split('::', 1)
            # NOTE(review): cell('B1') and .style.alignment follow the
            # openpyxl version the post was written for -- confirm against
            # the installed version.
            worksheet.cell('B{}'.format(i)).style.alignment.wrap_text = True
            worksheet.cell('B{}'.format(i)).value = text
            worksheet.cell('A{}'.format(i)).value = name
    workbook.save('wrap_text1.xlsx')
import openpyxl
def rebuild():
    """Recreate every file recorded in 'wrap_text1.xlsx'.

    Column A of each row holds the file path and column B its content; both
    are handed to remake().
    """
    wb = openpyxl.load_workbook('wrap_text1.xlsx')
    ws = wb.worksheets[0]
    # NOTE(review): get_highest_row() and cell('A1') follow the openpyxl
    # version the post was written for -- confirm against the installed one.
    row_count = ws.get_highest_row()
    # range() works on both Python 2 and 3; xrange exists only on Python 2.
    for i in range(1, row_count + 1):
        dir_file = ws.cell('A{}'.format(i)).value
        content = ws.cell('B{}'.format(i)).value
        remake(dir_file, content)
import os
def remake(path, content):
    """Recreate one file from its stored path and flattened content.

    path    -- file path; both '\\' and '/' are accepted as separators.
    content -- text in which '*=*' stands for a newline.

    Missing parent directories are created. The file is written as UTF-8.
    """
    # Imported locally because the surrounding snippet has no module-level
    # import for it.
    import io
    content = content.replace('*=*', '\n')
    # Normalise separators so the directory part can be derived portably.
    path = path.replace('\\', '/')
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    # Let the file object encode: writing content.encode('utf-8') (bytes) to
    # a text-mode file raises TypeError on Python 3.
    # NOTE(review): assumes content is a text (unicode) string -- confirm.
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(content)
# print_to_file() # print to temp file - paths and contents separated by '::'
# write_it() # write it into the excel file
# rebuild() # reconstruct directory

Copy certain files from one folder to another using python

I am trying to copy only certain files from one folder to another. The filenames are in a attribute table of a shapefile.
I have managed to write the filenames into a .csv file and to list the column containing the filenames to be transferred. I am stuck after that on how to read those filenames to copy them to another folder. I have read about using shutil.copy/move but am not sure how to use it. Any help is appreciated. Below is my script:
import arcpy
import csv
import os
import sys
import os.path
import shutil
from collections import defaultdict
fc = 'C:\\work_Data\\Export_Output.shp'
# NOTE(review): 'wokk_Data' differs from the 'work_Data' folder used for fc
# above -- looks like a typo; confirm before running.
CSVFile = 'C:\\wokk_Data\\Export_Output.csv'
src = 'C:\\UC_Training_Areas'
dst = 'C:\\MOSAIC_Files'
# Keep the non-geometry fields, minus the ones that are not wanted in the CSV.
# Filtering in one pass replaces an orphaned `if f.type <> 'Geometry':` line
# and a loop that removed items from the list while iterating over it (which
# skips the element after every removal).
skipped = ('FID', "Area", 'Category', 'SHAPE_Area')
fields = [
    f.name
    for f in arcpy.ListFields(fc)
    if f.type != 'Geometry' and f.name not in skipped
]
# Write the header row, then one CSV line per feature.
with open(CSVFile, 'w') as f:
    f.write(','.join(fields)+'\n')
    with arcpy.da.SearchCursor(fc, fields) as cursor:
        for row in cursor:
            f.write(','.join([str(r) for r in row])+'\n')
# Read the CSV back column-wise: columns[name] is the list of that column's values.
columns = defaultdict(list)
with open(CSVFile) as f:
    reader = csv.DictReader(f)
    for row in reader:
        for (k,v) in row.items():
            columns[k].append(v)
print(columns['label'])
Given the file names collected in
columns['label'], you can use the following to copy a file:
# columns['label'] is a list of file names, so copy them one at a time;
# passing the list itself to os.path.join raises TypeError.
for label in columns['label']:
    srcpath = os.path.join(src, label)
    dstpath = os.path.join(dst, label)
    shutil.copyfile(srcpath, dstpath)
Here is the script I used to solve my problem:
import os
import arcpy
import os.path
import shutil
featureclass = "C:\\work_Data\\Export_Output.shp"
src = "C:\\Data\\UC_Training_Areas"
dst = "C:\\Data\\Script"
# Move the file named by each feature's Label attribute from src into dst.
# Iterating the cursor replaces the manual rows.next() loop.
for row in arcpy.SearchCursor(featureclass):
    print(row.Label)
    shutil.move(os.path.join(src,str(row.Label)),dst)
Think of it in terms of a source and a destination.
Assume you want to copy a file from your Pictures folder to an Image folder located somewhere else on your machine (the destination).
X is your machine name
Z is the file name
import os
import shutil
import glob

source = "C:/Users/X/Pictures/test/Z.jpg"
dest = "C:/Users/Public/Image"
# Create the destination folder on first use.
if os.path.exists(dest):
    print("this folder exit in this dir")
else:
    os.mkdir(dest)
# glob.glob is the public API (the original called the private glob._iglob).
# source may also be a wildcard pattern such as '.../test/*.jpg'.
for file in glob.glob(source):
    shutil.copy(file, dest)
print("done")

Categories