I have just completed a script that (sigh) finally works. It searches Twitter for keywords. The results are written to a CSV with 4 columns: keyword, Tweet, Lat, Lon (location). The code I'm using is:
import tweepy
import csv
keywordList = ['McDonalds', 'Taco Bell', 'Burger King']

for keyword in keywordList:
    result = tweepy.api.search(q=keyword, rpp=1000, page=2, geocode="34.085422,-117.900879,500mi")
    with open(r'C:\Temp\results.csv', 'a') as acsv:
        w = csv.writer(acsv)
        for tweet in result:
            lat, lon = tweet.geo if tweet.geo else ('', '')
            try:
                a = tweet.geo['coordinates']
                print a[0], a[1]
                print tweet.text
                w.writerow((keyword, tweet.text, a[0], a[1]))
            except:
                pass
I want to use Task Manager or Python to run this search every 5 minutes, but it will rewrite duplicates. I was going to use the following code to remove duplicates, but two things happen: results2.csv is blank, and when I go to open the CSV it is locked so I have to view it read-only. I tried f1.close(), writer.close(), etc., but it says 'csv.reader' object has no attribute 'close'.
My biggest concern is getting no duplicates, either by writing to the new CSV or by somehow removing duplicates and writing to the same table on each search. Any suggestions are much appreciated!
import csv
f1 = csv.reader(open(r'C:\Temp\results.csv', 'rb'))
writer = csv.writer(open(r'C:\Temp\results2.csv', 'wb'))
tweet = set()
for row in f1:
    if row[1] not in tweet:
        writer.writerow(row)
        tweet.add(row[1])
f1.close()
writer.close()
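For what it's worth, the "no attribute close" error happens because csv.reader and csv.writer only wrap a file object, they don't own it, so there is nothing to close on them; keep references to the files themselves, or let with blocks manage them, and the lock on the output file goes away. A minimal sketch of that dedup pass along those lines (same paths as above, assuming column 1 is the tweet text):

import csv

seen = set()
with open(r'C:\Temp\results.csv', 'rb') as inf, open(r'C:\Temp\results2.csv', 'wb') as outf:
    reader = csv.reader(inf)
    writer = csv.writer(outf)
    for row in reader:
        if row[1] not in seen:  # dedup on the tweet text column
            writer.writerow(row)
            seen.add(row[1])
# both files are closed automatically when the with block exits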
Here's a refactored version:
Edit: unicode, what fun - I've added a .decode() call in read_csv() and an .encode() call in append_csv(); this should solve your problem (I think - you might need to decide on a string codec).
import tweepy
import csv
from collections import defaultdict
import time
FILE = 'c:/temp/results.csv'
KEYWORDS = ['McDonalds', 'Taco Bell', 'Burger King']
WHERE = "34.085422,-117.900879,500mi"
DELAY = 300 # seconds

def _float(s, err=None):
    try:
        return float(s)
    except ValueError:
        return err

def _str(f, err=""):
    return err if f is None else str(f)

def read_csv(fname=FILE):
    data = defaultdict(dict)
    with open(fname, 'rb') as inf:
        incsv = csv.reader(inf)
        for kw, tw, lat, lon in incsv:
            # added .decode() call to handle saved unicode chars
            data[kw][tw.decode()] = (_float(lat), _float(lon))
    return data

def append_csv(data, fname=FILE):
    with open(fname, "ab") as outf:
        outcsv = csv.writer(outf)
        # added .encode() call to handle saved unicode chars
        outcsv.writerows((kw, tw.encode(), _str(lat), _str(lon)) for kw, dat in data.iteritems() for tw, (lat, lon) in dat.iteritems())

def search_twitter(keywords=KEYWORDS, loc=WHERE):
    data = defaultdict(dict)
    for kw in keywords:
        for tweet in tweepy.api.search(q=kw, rpp=1000, page=2, geocode=loc):
            data[kw][tweet.text] = tweet.geo if tweet.geo else (None, None)
    return data

def calc_update(old_data, new_data):
    diff = defaultdict(dict)
    for kw, dat in new_data.iteritems():
        for tw, loc in dat.iteritems():
            if tw not in old_data[kw]:
                diff[kw][tw] = old_data[kw][tw] = loc
    return old_data, diff

def show_data(data):
    for kw, dat in data.iteritems():
        for tw, (lat, lon) in dat.iteritems():
            print("<{},{}> {} [{}]".format(_str(lat, "???"), _str(lon, "???"), tw, kw))

def main():
    data = read_csv()
    while True:
        new_data = search_twitter()
        data, diff = calc_update(data, new_data)
        append_csv(diff)
        show_data(diff)
        time.sleep(DELAY)

if __name__ == "__main__":
    main()
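If you would rather let Windows Task Scheduler drive the 5-minute cadence instead of the in-script while loop, a single-pass variant of main() (a sketch, untested) could look like this; the scheduler then simply re-runs the script every 5 minutes:

def main_single_pass():
    # one search/update cycle, then exit; the scheduler handles the repetition
    data = read_csv()
    new_data = search_twitter()
    data, diff = calc_update(data, new_data)
    append_csv(diff)
    show_data(diff)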
I am trying to create a generic filter that splits a file based on conditions from a YAML file.
My code runs with Pandas, but since the environment doesn't have the Pandas module, I am trying to achieve the same thing with the csv library.
When I hard-code the value at q it works, but when I try to pass it from the config file it doesn't. I also want to apply multiple checks on the same column, like ('', 'Balance'), so that 'Asset' rows go to one file and the ('', 'Balance') rows go to another.
import sys
import yaml
import csv
def dynamicQuery(config_file, data_file, outputPath):
    """Loading Configuration file into dataframe"""
    try:
        with open(config_file) as file:
            doc = yaml.full_load(file)
    except Exception as err:
        print("Error Configuration data file: ", err)

    try:
        for k, v in doc.items():
            if k != 'column':
                filename = k
                k = doc[k]
                q = ' , '.join(f'{v} ' for q, v in k.items())
                q = '"' + str(strip(q)) + '"'
                print(q)  # -- "Asset"
                df = csv.reader(open(data_file), delimiter=',')
                df = filter(lambda x: (x[2] == q), df)  # Not working here
                # df = filter(lambda x: x[2] == "Asset", df)  --> this is working
                csv.writer(open(filename + ".txt", 'w', newline=' '), delimiter=',').writerows(df)
                print("File is created for " + filename)
    except Exception as err:
        print("Error executing queries and saving output data file: ", err)

def main():
    if len(sys.argv) == 3:
        """File will be passed as parameter """
        config_file = sys.argv[1]
        data_file = sys.argv[2]
        dynamicQuery(config_file, data_file)
    else:
        usage()

def usage():
    print("Usage: python splitGenric.py config_file data_file ")

main()
Sample file
1233,ACV,Asset,sample
1235,ACV,Asset,sample
1232,ACV,Asset,sample
1234,ACV,Asset,sample
1237,ACV,,sample
1238,ACV,,sample
1234,ACV,Balance,sample
1254,ACV,Balance,sample
1244,ACV,Balance,sample
1264,ACV,Balance,sample
Config.yaml
Asset :
  filter1: '"Asset"'
Balance:
  filter1: '"Balance"'
  filter2: '""'
The YAML configuration file format is not particularly convenient for this, and yaml is not a standard Python module. I would probably go for something like regular expressions instead of a YAML file. But just to sort out the immediate problems, the problem here is that you are mixing up Python syntax and literal quoting characters. You are assembling a string containing literal double quotes around Asset for example, where your CSV file does not contain double quotes around this value; and so you are effectively comparing if 'Asset' == '"Asset"' which of course is False.
The following might not do exactly what you want, but should at least demonstrate a rough first cut of what I think you are trying to do here.
with open(config_file) as file:
    config = yaml.full_load(file)

filters = dict()
for k, v in config.items():
    handle = open(k + '.txt', 'w', newline='')
    writer = csv.writer(handle, delimiter=',')
    filt = {'handle': handle, 'writer': writer, 'conditions': []}
    for _, expr in v.items():
        filt['conditions'].append(expr.strip('"'))
    filters[k] = filt

with open(data_file) as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        for handle, conf in filters.items():
            for i in range(len(conf['conditions'])):
                if row[2] == conf['conditions'][i]:
                    conf['writer'].writerow(row)
                    break

for handle, conf in filters.items():
    conf['handle'].close()
I'm guessing you used pyyaml, which seems to be the dominant YAML module for Python.
I tried to use the config.yaml, but I got this error:
File "C:\Users\XXXXXX\AppData\Local\Programs\Python\Python36-32\lib\site-packages\yaml\parser.py", line 439, in parse_block_mapping_key
"expected <block end>, but found %r" % token.id, token.start_mark)
yaml.parser.ParserError: while parsing a block mapping
in "config.yml", line 5, column 5
expected <block end>, but found ','
in "config.yml", line 5, column 17
But I will pretend it worked and the content was loaded in a dictionary, as it appears to be the intention.
The dictionary is as:
doc = {'Asset':'Asset','Balance':[' ','Balance']}
import csv
import pandas as pd

# load directly to dataframe
df = pd.read_csv('sample.txt', header=None)
handler = ''
for k, v in doc.items():
    kList = {k: []}  # making empty lists with k values
    if isinstance(v, str):  # Asset is a string
        fil = v
    else:
        for i in range(len(v)):  # Balance is a list of values
            if v[i]:
                fil = v[i]
            else:
                handler = k  # remember which key takes the null rows
    for types in df.values:
        if fil in types:
            kList[k].append(types)  # append matching rows to the corresponding list
    csv.writer(open(k + ".txt", 'a', newline='\n'), delimiter=',').writerows(kList[k])

if handler:  # there are null values
    nulls = df[df.isnull().any(axis=1)].values.tolist()
    csv.writer(open(handler + ".txt", 'a', newline='\n'), delimiter=',').writerows(nulls)
The result is two files, with the following contents:
Asset.txt:
1233,ACV,Asset,sample
1235,ACV,Asset,sample
1232,ACV,Asset,sample
1234,ACV,Asset,sample
Balance.txt:
1234,ACV,Balance,sample
1254,ACV,Balance,sample
1244,ACV,Balance,sample
1264,ACV,Balance,sample
1237,ACV,nan,sample
1238,ACV,nan,sample
I have a CSV file where each record is a LinkedIn contact. I have to create another CSV file containing only the contacts reached after a specific date (e.g. all the contacts connected to me after 1/04/2017).
So this is my implementation:
def import_from_csv(file):
    key_order = ("FirstName", "LastName", "EmailAddress", "Company", "ConnectedOn")
    linkedin_contacts = []
    with open(file, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',')
        for row in reader:
            single_person = {"FirstName": row["FirstName"], "LastName": row["LastName"],
                             "EmailAddress": row["EmailAddress"], "Company": row["Company"],
                             "ConnectedOn": parser.parse(row["ConnectedOn"])}
            od = OrderedDict((k, single_person[k]) for k in key_order)
            linkedin_contacts.append(od)
    return linkedin_contacts
The first function gives me a list of ordered dicts. I don't know if the way I used to achieve the correct order is good. Also, looking at some examples (like here), I'm not using the od.update method, but I don't think I need it; is that correct?
Now I wrote a second function to filter the list:
def filter_by_date(connections):
    filtered_list = []
    target_date = parser.parse("01/04/2017")
    for row in connections:
        if row["ConnectedOn"] > target_date:
            filtered_list.append(row)
    return filtered_list
Am I doing this correctly?
Is there a way to optimize the code? Thanks
First point: you don't need the OrderedDict at all, just use a csv.DictWriter to write the filtered csv.
fieldnames = ("FirstName","LastName","EmailAddress","Company","ConnectedOn")
with open("/apth/to/final.csv", "wb") as f:
writer = csv.DictWriter(f, fieldnames)
writer.writeheader()
writer.writerows(filtered_contacts)
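Note that "wb" is Python 2 style; if you are on Python 3 (your open(file, encoding="utf8") call suggests you are), open the destination as a text file with newline='' instead, something like:

# Python 3: open as text with newline='' so the csv module controls line endings
with open("/path/to/final.csv", "w", newline="", encoding="utf8") as f:
    writer = csv.DictWriter(f, fieldnames)
    writer.writeheader()
    writer.writerows(filtered_contacts)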
Second point: you don't need to create a new dict from the one yielded by the csv reader, just update the ConnectedOn key in place :
def import_from_csv(file):
    linkedin_contacts = []
    with open(file, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',')
        for row in reader:
            row["ConnectedOn"] = parser.parse(row["ConnectedOn"])
            linkedin_contacts.append(row)
    return linkedin_contacts
And finally, if all you have to do is take the source csv, filter records on ConnectedOn and write the result, you don't need to load the whole source into memory, build a filtered list (in memory again) and then write it out; you can stream the whole operation:
def filter_csv(source_path, dest_path, date):
    fieldnames = ("FirstName", "LastName", "EmailAddress", "Company", "ConnectedOn")
    target = parser.parse(date)
    with open(source_path, "rb") as source, open(dest_path, "wb") as dest:
        reader = csv.DictReader(source)
        writer = csv.DictWriter(dest, fieldnames)
        # if you want a header line with the fieldnames - else comment it out
        writer.writeheader()
        for row in reader:
            row_date = parser.parse(row["ConnectedOn"])
            if row_date > target:
                writer.writerow(row)
And here you are, plain and simple.
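A hypothetical call, with made-up file names, would then be:

filter_csv("linkedin_export.csv", "filtered_contacts.csv", "01/04/2017")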
NB: I don't know what parser.parse() is, but as other answers mention, you'd probably be better off using the datetime module instead.
For filtering you could use the filter() function:
from datetime import datetime

def filter_by_date(connections):
    target_date = datetime.strptime("01/04/2017", '%d/%m/%Y').date()
    return list(filter(lambda x: x["ConnectedOn"] > target_date, connections))
And instead of creating a plain dict and then copying its values into an OrderedDict, you could write the values directly to the OrderedDict:
for row in reader:
    od = OrderedDict()
    od["FirstName"] = row["FirstName"]
    od["LastName"] = row["LastName"]
    od["EmailAddress"] = row["EmailAddress"]
    od["Company"] = row["Company"]
    od["ConnectedOn"] = datetime.strptime(row["ConnectedOn"], '%d/%m/%Y').date()
    linkedin_contacts.append(od)
If you know the date format you don't need python_dateutil; you can use the built-in datetime.datetime.strptime() with the needed format.
That's because you don't specify the format string.
Use:
from datetime import datetime
format = '%d/%m/%Y'
date_text = '01/04/2017'
# inverse by datetime.strftime(format)
datetime.strptime(date_text, format)
#....
# with format as global
for row in reader:
    od = OrderedDict()
    od["FirstName"] = row["FirstName"]
    od["LastName"] = row["LastName"]
    od["EmailAddress"] = row["EmailAddress"]
    od["Company"] = row["Company"]
    od["ConnectedOn"] = datetime.strptime(row["ConnectedOn"], format)
    linkedin_contacts.append(od)
Do:
def filter_by_date(connections, date_text):
    target_date = datetime.strptime(date_text, format)
    return [x for x in connections if x["ConnectedOn"] > target_date]
I was trying to make a tool that updates yaml values in files that have "PENDING" in them. It does work, but I need it to be formatted like this:
fields:
  setName: ("name")
  WishName: ("name")
  WishNameState: ("PENDING")
However, it wants to dump it in this format:
fields: {WishName: ("name"), WishNameState: ("APPROVED"), setName: ("name")}
How can I make it dump in the format I want it to?
Here's my code, so you know how I'm currently doing it:
import glob
import os
import sys
import yaml

def processFile(f, t):
    data = open(f, 'rb').read()
    lines = data.replace('\r\n', '\n').split('\n')
    lines_found = []
    for i, x in enumerate(lines):
        if t in x:
            lines_found.append(i + 1)
    return lines_found

term = 'PENDING'
for x in glob.glob('*.yaml'):
    r = processFile(x, term)
    if r:
        with open(x) as f:
            yamlfile = yaml.load(f)
        fields = yamlfile['fields']
        name = fields['WishName']
        print('Name: ' + name)
        print('Approve or reject?')
        aor = raw_input('a/r: ')
        if aor == 'a':
            fields['setName'] = name
            fields['WishNameState'] = '("APPROVED")'
            with open(x, "w") as f:
                yaml.dump(yamlfile, f)
        elif aor == 'r':
            fields['WishNameState'] = '("REJECTED")'
            with open(x, "w") as f:
                yaml.dump(yamlfile, f)
        else:
            print('Invalid response. Shutting down...')
            sys.exit()
print('End of results!')
Any and all help is appreciated! Thanks :)
In your code, replace
yaml.dump(yamlfile, f)
with
yaml.dump(yamlfile, f, default_flow_style=False)
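As a quick illustration (key order may differ, since PyYAML sorts keys by default), dumping a mapping like yours with default_flow_style=False gives block style:

import yaml

doc = {'fields': {'setName': '("name")', 'WishName': '("name")', 'WishNameState': '("PENDING")'}}
print(yaml.dump(doc, default_flow_style=False))
# fields:
#   WishName: ("name")
#   WishNameState: ("PENDING")
#   setName: ("name")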
I am trying to extract tagged entities from a csv file using python. This file contains tagged entities in multiple columns of the csv file. I only want python to process one specific column. Can anybody show me how to do this?
This is my code:
from bs4 import BeautifulSoup
import csv
input_name = "file.csv" # File names for input and output
output_name = "entities.csv"
def incrementEntity(entity_string, dictionary):
    try:
        dictionary[entity_string] += 1
    except KeyError:
        dictionary[entity_string] = 1

def outputResults(dictionary, entity_type, f):
    for i in sorted(dictionary, key=dictionary.get, reverse=True):
        print i, '\t', entity_type, '\t', dictionary[i]
        f.writerow([i, entity_type, dictionary[i]])

try:
    f = open(input_name, 'r')
    soup = BeautifulSoup(f)
    f.close()
except IOError, message:
    print message
    raise ValueError("Input file could not be opened")

locations = {}
people = {}
orgs = {}

for i in soup.find_all():
    entity_name = i.get_text()
    entity_type = i.name
    if entity_type == 'i-loc' or entity_type == 'b-loc':
        incrementEntity(entity_name, locations)
    elif entity_type == 'b-org' or entity_type == 'i-org':
        incrementEntity(entity_name, orgs)
    elif entity_type == 'b-per' or entity_type == 'i-per':
        incrementEntity(entity_name, people)
    else:
        continue

output_file = open(output_name, 'w')
f = csv.writer(output_file)

print "Entity\t\tType\t\tCount"
print "------\t\t----\t\t-----"
f.writerow(["Entity", "Type", "Count"])

outputResults(locations, 'location', f)
outputResults(people, 'person', f)
outputResults(orgs, 'organization', f)

output_file.close()
By definition, a CSV is a file in which data is separated by commas. So all you have to do is use the .split() method of the string you are dealing with.
Example:
csvline = 'Joe,25,M'
age = csvline.split(',')[1]
I don't know exactly what kind of data you are trying to process, but since you are trying to use BeautifulSoup I will assume your CSV file contains plain HTML-like data in some of its columns AND that you want to join the data of all those columns to process it with BeautifulSoup. That being the case you could try something like:
f = open(input_name, 'r')
htmlstring = '\n'.join([line.split(',')[1] for line in f])
soup = BeautifulSoup(htmlstring)
f.close()
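Keep in mind that str.split(',') will mis-split any field that contains a quoted comma; the csv module handles that quoting for you. A sketch along the same lines (still assuming the HTML-like data sits in the second column, index 1):

import csv

with open(input_name, 'r') as f:
    reader = csv.reader(f)
    # join the values of the column you care about, then parse with BeautifulSoup
    htmlstring = '\n'.join(row[1] for row in reader)

soup = BeautifulSoup(htmlstring)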
I want to read files in an advanced mode.
First:
In this file, I have certain steps the code has to follow. How do I read the steps until the string [data] appears?
[Steps]
step1 = WebAddress
step2 = Tab
step3 = SecurityType
step4 = Criteria
step5 = Date
step6 = Click1
step7 = Results
step8 = Download
[data]
......
Second:
How can I read everything after [data]?
[data]
WebAddress___________________________ Destination___________ Tab_____________ SecurityType___________________________________________________ Criteria___ Date_______ Click1_ Results_ Download
https://mbsdisclosure.fanniemae.com/ q:\\%s\\raw\\fnmapool Advanced Search Interim MBS: Single-Family Issue Date 09/01/2012 Search 100 CSV XML
https://mbsdisclosure.fanniemae.com/ q:\\%s\\raw\\fnmapool Advanced Search Preliminary Mega: Fannie Mae/Ginnie Mae backed Adjustable Rate Issue Date 09/01/2012 Search 100 CSV XML
https://mbsdisclosure.fanniemae.com/ q:\\%s\\raw\\fnmapool Advanced Search Preliminary Mega: Fannie Mae/Ginnie Mae backed Fixed Rate Issue Date 09/01/2012 Search 100 CSV XML
I want to pass everything under each step____________________ header, where step can be any of the steps (e.g. WebAddress).
So for example, if step1 = WebAddress, how do I read everything under WebAddress___________________________, and so on? Thanks!
First:
with open(file_name) as f:
    print (f.read()).split("[data]")
Second:
with open(file_name) as f:
    pre_data, post_data = [s.strip() for s in (f.read()).split("[data]")]

post_data_lines = post_data.splitlines()
headers = post_data_lines[0].split()
print headers
for line in post_data_lines[1:]:
    print line.split()
    print dict(zip(headers, line.split()))
I'm also not sure how your [data] section is delimited; you may want line.split('\t') if it's tab-separated.
This is untested, but it should work. It doesn't get you all the way to where you want, but at least it covers most of it (the "hard" parts).
To split by header width, use:
file_name = "testdata.txt"
with open(file_name) as f:
pre_data,post_data =[s.strip() for s in (f.read()).split("[data]")]
post_data_lines = post_data.splitlines()
headers = post_data_lines[0].split()
for line in post_data_lines[1:]:
tmpline = []
pos = 0
for itm in headers:
tmpline.append(line[pos:pos+len(itm)])
pos += len(itm)+1
print dict(zip(headers,tmpline))
And if you want the actual headers without the underscores, use:
file_name = "testdata.txt"
with open(file_name) as f:
pre_data,post_data =[s.strip() for s in (f.read()).split("[data]")]
post_data_lines = post_data.splitlines()
headers = post_data_lines[0].split()
headers2 = [s.replace("_"," ").strip() for s in headers]
for line in post_data_lines[1:]:
tmpline = []
pos = 0
for itm in headers:
tmpline.append(line[pos:pos+len(itm)])
pos += len(itm)+1
print dict(zip(headers2,tmpline))
First step:
>>> import ConfigParser
>>> cfg = ConfigParser.RawConfigParser()
>>> with open('sample.cfg') as f:
...     cfg.readfp(f)
...
>>> cfg.get('Steps','step1')
'WebAddress'
Second step:
>>> data_section = ''
>>> with open('sample.cfg') as f:
...     data_section = f.read()
...
>>> data = data_section[data_section.index('[data]')+len('[data]')+1:]
>>> import csv, io
>>> reader = csv.reader(io.BytesIO(data), delimiter='\t')
>>> reader.next() # skips header
>>> results = [row for row in reader]
Now results is a list of lists, with each inner list having items from the data section.
[['https://mbsdisclosure.fanniemae.com/','q:\\\\%s\\\\raw\\\\fnmapool','Advanced Search', 'Interim MBS: Single-Family', 'Issue Date','09/01/2012','Search','100', 'CSV XML']...]