Getting the max value from each column of a CSV file - Python

Would anybody help me solve the following problem? I have tried it on my own and attached my solution below. I used a 2-D list, but I want a different, more Pythonic solution that avoids the 2-D list.
Please suggest any other way of doing this.
Q) Consider share prices for N companies, given for each month since the year 1990 in a CSV file. The format of the file is as below, with the first line as the header.
Year,Month,Company A, Company B,Company C, .............Company N
1990, Jan, 10, 15, 20, , ..........,50
1990, Feb, 10, 15, 20, , ..........,50
.
.
.
.
2013, Sep, 50, 10, 15............500
The solution should be in this format:
a) For each company, list the year and month in which the share price was highest.
Here is my answer using a 2-D list.
def generate_list(file_path):
    '''Return a list of lists containing the file data, one inner list per column.'''
    data_list = None
    try:
        file_obj = open(file_path, 'r')
        try:
            gen = (line.split(',') for line in file_obj)  # generator yielding one split line at a time until EOF
            for j, line in enumerate(gen):
                if not data_list:
                    # if data_list is None, create n empty lists, where n is the number of columns
                    data_list = [[] for i in range(len(line))]
                if line[-1].endswith('\n'):
                    line[-1] = line[-1][:-1]  # remove the trailing '\n' from the last field
                # convert prices to float; leave the header row and the year/month columns as strings
                for i, value in enumerate(line):
                    if i >= 2 and j >= 1:
                        data_list[i].append(float(value))
                    else:
                        data_list[i].append(value)
        finally:
            file_obj.close()
    except IOError, io_exception:
        print io_exception
    return data_list
def generate_result(file_path):
    '''Return a list of tuples of the form (max_price, year, month, company_name).'''
    data_list = generate_list(file_path)
    results = []  # results stored as [(max_price, year, month, company_name), ...]
    if data_list:
        for i, column in enumerate(data_list):
            if i >= 2:
                max_price = max(column[1:])    # highest price for this company
                idx = column.index(max_price)  # index of the highest price in the column
                year = data_list[0][idx]       # year at that index
                month = data_list[1][idx]      # month at that index
                company = column[0]            # company name from the header
                results.append((max_price, year, month, company))
    return results
if __name__ == '__main__':
    file_path = 'C:/Document and Settings/RajeshT/Desktop/nothing/imp/New Folder/tst.csv'
    result = generate_result(file_path)
    print 'result ', result
I also tried to solve it with a generator, but in that case it produced a result for only one company, i.e. only one column.
p = 'filepath.csv'
f = open(p, 'r')
head = f.readline()
# BUG: the inner `for line in f` exhausts the file object during the first pass
# over n, so only the first company's column is ever compared
gen = ((float(line.split(',')[n]), line.split(',', 2)[0:2], head.split(',')[n])
       for n in range(2, len(head.split(','))) for i, line in enumerate(f))
x = max((i for i in gen), key=lambda x: x[0])
print x
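One way to repair that attempt while staying close to it (a sketch, not necessarily the most Pythonic option) is to rewind the file with seek(0) for each company, so the inner loop always has lines to read:
p = 'filepath.csv'
with open(p) as f:
    head = f.readline().strip().split(',')
    results = []
    for n in range(2, len(head)):
        f.seek(0)
        f.readline()  # skip the header again on every pass
        best = max(((float(line.split(',')[n]),) + tuple(line.split(',')[:2]) + (head[n],)
                    for line in f), key=lambda t: t[0])
        results.append(best)
print results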
You can use the input data provided below, which is in CSV format:
year,month,company 1,company 2,company 3,company 4,company 5
1990,jan,201,245,243,179,133
1990,feb,228,123,124,121,180
1990,march,63,13,158,88,79
1990,april,234,68,187,67,135
1990,may,109,128,46,185,236
1990,june,53,36,202,73,210
1990,july,194,38,48,207,72
1990,august,147,116,149,93,114
1990,september,51,215,15,38,46
1990,october,16,200,115,205,118
1990,november,241,86,58,183,100
1990,december,175,97,143,77,84
1991,jan,190,68,236,202,19
1991,feb,39,209,133,221,161
1991,march,246,81,38,100,122
1991,april,37,137,106,138,26
1991,may,147,48,182,235,47
1991,june,57,20,156,38,245
1991,july,165,153,145,70,157
1991,august,154,16,162,32,21
1991,september,64,160,55,220,138
1991,october,162,72,162,222,179
1991,november,215,207,37,176,30
1991,december,106,153,31,247,69
The expected output is the following:
[(246.0, '1991', 'march', 'company 1'),
(245.0, '1990', 'jan', 'company 2'),
(243.0, '1990', 'jan', 'company 3'),
(247.0, '1991', 'december', 'company 4'),
(245.0, '1991', 'june', 'company 5')]
Thanks in advance...

Using collections.OrderedDict and collections.namedtuple:
import csv
from collections import OrderedDict, namedtuple

with open('abc1') as f:
    reader = csv.reader(f)
    tup = namedtuple('tup', ['price', 'year', 'month'])
    d = OrderedDict()
    names = next(reader)[2:]
    for name in names:
        # initialize the dict with a sentinel entry per company
        d[name] = tup(0, 'year', 'month')
    for row in reader:
        year, month = row[:2]  # use `year, month, *prices = row` in py3.x
        for name, price in zip(names, map(int, row[2:])):  # map(int, prices) in py3.x
            if d[name].price < price:
                d[name] = tup(price, year, month)
    print d
Output:
OrderedDict([
    ('company 1', tup(price=246, year='1991', month='march')),
    ('company 2', tup(price=245, year='1990', month='jan')),
    ('company 3', tup(price=243, year='1990', month='jan')),
    ('company 4', tup(price=247, year='1991', month='december')),
    ('company 5', tup(price=245, year='1991', month='june'))])
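If you need the exact list-of-tuples layout from the question, the ordered dict converts directly; a small follow-up using the d built above:
result = [(float(t.price), t.year, t.month, name) for name, t in d.items()]
# [(246.0, '1991', 'march', 'company 1'), (245.0, '1990', 'jan', 'company 2'), ...]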

I wasn't entirely sure how you wanted the output, so for now I just have it printed to the screen.
import os
import csv
import codecs

## Import data !!!!!!!!!!!! CHANGE TO APPROPRIATE PATH !!!!!!!!!!!!!!!!!
filename = os.path.expanduser("~/Documents/PYTHON/StackTest/tailor_raj/Workbook1.csv")

## Get usable data
data = [row for row in csv.reader(codecs.open(filename, 'rb', encoding="utf_8"))]

## Find number of rows
row_count = sum(1 for row in data) - 1

## Find number of columns
## Probe successive indexes on the header row until one fails.
## The failure is caught by try/except so the program does not crash.
columns_found = False
column_try = 1
while columns_found == False:
    column_try += 1
    try:
        identify_column = data[0][column_try]
    except IndexError:
        columns_found = True
## Set column count to the discovered column count (1 before it failed)
column_count = column_try - 1

## Set which company we are checking (indexes start at 0, so the first company is at 2, not 3)
companyIndex = 2

## This keeps all the company bests as single rows of text. I was not sure how you wanted them output.
companyBest = []

## Loop through each company
while companyIndex <= column_count:
    ## For each new company reset rowIndex and highestShare
    rowIndex = 1
    highestShare = rowIndex
    ## Loop through each row
    while rowIndex <= row_count:
        ## Test whether the data point is above or equal to the current max
        ## (ties resolve to the most recent high point)
        if int(data[highestShare][companyIndex]) <= int(data[rowIndex][companyIndex]):
            highestShare = rowIndex
        ## Move on to the next row
        rowIndex += 1
    ## Company best = company name + year + month + value
    companyBest.append(str(data[0][companyIndex]) + ": " + str(data[highestShare][0]) + ", " + str(data[highestShare][1]) + ", " + str(data[highestShare][companyIndex]))
    ## Move on to the next company
    companyIndex += 1

for item in companyBest:
    print item
Be sure to change the filename path to one more appropriate.
Output is currently displayed like this:
Company A: 1990, Nov, 1985
Company B: 1990, May, 52873
Company C: 1990, May, 3658
Company D: 1990, Nov, 156498
Company E: 1990, Jul, 987
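As a side note, the column-probing loop above can be skipped entirely: csv.reader already returns each row as a list, so the counts can be read off directly. A minimal alternative under that assumption:
row_count = len(data) - 1        # data rows, excluding the header
column_count = len(data[0]) - 1  # index of the last column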

No generator unfortunately, but small code size, especially in Python 3:
from operator import itemgetter
from csv import reader

with open('test.csv') as f:
    year, month, *data = zip(*reader(f))

for pricelist in data:
    name = pricelist[0]
    prices = map(int, pricelist[1:])
    i, price = max(enumerate(prices), key=itemgetter(1))
    print(name, price, year[i+1], month[i+1])
In Python 2.x you can do the same thing, just slightly more clumsily, using the following (and the print statement instead of the function):
with open('test.csv') as f:
    columns = zip(*reader(f))
year, month = columns[:2]
data = columns[2:]
Okay, I came up with some gruesome generators! It also makes use of lexicographic tuple comparison and reduce to compare consecutive lines:
from functools import reduce  # only needed in Python 3
import csv

def group(year, month, *prices):
    return ((int(p), year, month) for p in prices)

def compare(a, b):
    return map(max, zip(a, group(*b)))

def run(fname):
    with open(fname) as f:
        r = csv.reader(f)
        names = next(r)[2:]
        return zip(names, reduce(compare, r, group(*next(r))))

list(run('test.csv'))
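To see why the lexicographic tuple comparison does the right thing here: max compares the first elements (the prices) before anything else, so the year and month simply ride along with the winning price. A tiny illustration with values from the sample data:
>>> max((201, '1990', 'jan'), (246, '1991', 'march'))
(246, '1991', 'march')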

Related

Web Scraping Using Beautiful Soup (data group prints into one cell instead of across individual cells)

I'm trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame (and webpages like it) so that each value lands in its own cell across a row of a CSV file. Currently everything prints into one cell.
The ideal format would be:
Name | Year | PER # | GP | Min  | etc.
name | 2003 | Per#  | 75 | 39.4 | ...
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

names_list = ["name", "lots", "of", "names", "defined", "earlier"]  # names of players
pers = ['Per#', "lots", "of", "PER numbers (integers)", "defined", "earlier"]  # player efficiency ratings
statsurls = ['https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame']  # 100s of player pages in this same format (found by varying the 'id/#' and 'year/#')
statslist = []

def GetStats(statsurl):
    statsreq = requests.get(statsurl)
    statssoup = BeautifulSoup(statsreq.text, 'lxml')
    t = statssoup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
    all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
    split_list = [elem.split() for elem in all_splits]
    statslist.append(split_list)

with ThreadPoolExecutor() as executor:
    RunStats = [executor.submit(GetStats, statsurl) for statsurl in statsurls]

names_stats = pd.DataFrame(zip(names_list, pers, statslist))
names_stats.to_csv('names&stats.csv', index=False)
I think the issue is that names_list and pers are flat lists but statslist is a list of lists, so just zipping the 3 together directly results in a list of tuples like
[
    (name0, pers0, all_splits_0_LIST),
    (name1, pers1, all_splits_1_LIST),
    (name2, pers2, all_splits_2_LIST),
    ## AND SO ON...
]
instead you might want to do something like
names_stats = pd.DataFrame([tuple([n, p] + s) for n, p, s in zip(names_list, pers, statslist)])
I don't think zipping separately obtained lists together is the best way to ensure that the right data is matched up to the right name... wouldn't it be more reliable to get the name and player id from statssoup and statsurl?
def GetStats(statsurl):
    statsreq = requests.get(statsurl)
    statssoup = BeautifulSoup(statsreq.text, 'lxml')
    t = statssoup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
    all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
    split_list = [elem.split() for elem in all_splits]
    pers = statsurl.split('/id/', 1)[-1].split('/', 1)[0]  # the player id segment of the URL
    yr = statsurl.split('/year/', 1)[-1].split('/', 1)[0]  # the season year segment of the URL
    # name = statssoup.select_one('h1.PlayerHeader__Name').get_text(' ').strip()  # not as safe as:
    name = ' '.join([n.get_text(' ').strip() for n in statssoup.select('h1.PlayerHeader__Name') if n.get_text().strip()])
    statslist.append([name, yr, pers] + split_list)
and then you can just form the DataFrame with names_stats = pd.DataFrame(statslist).

How to parse values from a text file into a list, while filling missing values with None?

I have a text file of raw data I am parsing.
There are certain codes in there that indicate the field.
These values will go into lists, then into a pandas dataframe, and eventually a database.
For example a small portion with 2 records looks like:
INS*Y*18*001*AL*A*E**AC**N~
REF*1L*690553677~
DTP*348*D8*20200601~
DTP*349*D8*20200630~
HD*024**FAC*KJ/165/////1M*IND~
INS*Y*18*001*AL*A*E**AC**N~
REF*1L*6905456455~
DTP*348*D8*20200601~
HD*024**FAC*KJ/165/////1M*IND~
"DTP" indicates a date, and 348 means a start_date and 349 indicates an end_date.
Each group of line corresponds to a member in membership data.
The "REF" is the line with the Members membership number.
"INS" indicates its a new member or record in the database.
Some members don't have an end_date line "DTP*349" like our second record.
These should append to the end_date list with "" to hold a place as a null
While looping through each line, look for where the line starts with the code I want, and splits the line, and takes the specified element.
How do I account for where a certain field is missing in the loop, so that if a member has an end_date or not, there will be a value in that members index place, so it can all be put in a pandas dataframe?
My code thus far look like:
membership_type = []
member_id = []
start_date = []
end_date = []

with open(path2 + fileName, "r") as txtfile:
    for line in txtfile:
        fields = line.split("*")
        # Member type
        if line.startswith("INS*"):
            membership_type.append(fields[4])
        # Member ID
        if line.startswith("REF*"):
            member_id.append(fields[2])
        # Start dates
        if line.startswith("DTP*348*"):
            start_date.append(fields[3])
        # End dates
        '''What goes here?'''
The results should look like:
print(membership_type)
['AL', 'AL']
print(member_id)
['690553677', '690545645']
print(start_date)
['20200601', '20200601']
print(end_date)
['20200630', '']
Every record will have an INS, REF, and HD field.
Use readlines to get all the rows as strings.
Clean each row of text and then use re.split to split on multiple delimiters, * and / in this case.
Splitting on / properly separates unique items in the string, but also creates blank entries, which are then removed.
Use enumerate on the rows.
With the entire list of rows and the current index i, you can use i plus or minus some offset to compare against a different row.
If the next row after DTP 348 isn't DTP, add None or ''.
Filling the blanks with None makes it easy to convert to a datetime format in pandas.
Remember, line is one row in lines, where each line is enumerated with i. The current line is lines[i] and the next line is lines[i + 1].
import re

membership_type = list()
member_id = list()
start_date = list()
end_date = list()
name = list()
first_name = list()
middle_name = list()
last_name = list()
some_list = list()  # e.g. for the optional prefix/suffix fields

with open('test.txt', "r") as f:
    lines = [re.split(r'\*|/', x.strip().replace('~', '')) for x in f.readlines()]  # clean and split each row
    lines = [[i for i in l if i] for l in lines]  # remove blank entries
    for i, line in enumerate(lines):
        print(line)  # only if you want to see
        # Member type
        if line[0] == "INS":
            membership_type.append(line[4])
        # Member ID
        elif line[0] == 'REF':
            member_id.append(line[2])
        # Start dates
        elif (line[0] == 'DTP') and (line[1] == '348'):
            start_date.append(line[3])
            if lines[i + 1][0] != 'DTP':  # the next line should be the end_date; if it's not, add None
                end_date.append(None)
        # End dates
        elif (line[0] == 'DTP') and (line[1] == '349'):
            end_date.append(line[3])
        # Names
        elif line[0] == 'NM1':
            name.append(' '.join(line[3:]))
            first_name.append(line[3])
            middle_name.append(line[4])
            last_name.append(line[5])
            try:
                some_list.append(line[6])
            except IndexError:
                print('No prefix')
                some_list.append(None)
            try:
                some_list.append(line[7])
            except IndexError:
                print('No suffix')
                some_list.append(None)
print(membership_type)
print(member_id)
print(start_date)
print(end_date)
print(name)
print(first_name)
print(middle_name)
print(last_name)
['AL', 'AL']
['690553677', '6905456455']
['20200601', '20200601']
['20200630', None]
['SMITH JOHN PAUL MR JR', 'IMA MEAN TURD MR SR']
['SMITH', 'IMA']
['JOHN', 'MEAN']
['PAUL', 'TURD']
Load into pandas
import pandas as pd

data = {'start_date': start_date, 'end_date': end_date, 'member_id': member_id, 'membership_type': membership_type,
        'name': name, 'first_name': first_name, 'middle_name': middle_name, 'last_name': last_name}
df = pd.DataFrame(data)

# convert datetime columns
df.start_date = pd.to_datetime(df.start_date)
df.end_date = pd.to_datetime(df.end_date)

# display df
  start_date   end_date   member_id membership_type                   name first_name middle_name last_name
0 2020-06-01 2020-06-30   690553677              AL  SMITH JOHN PAUL MR JR      SMITH        JOHN      PAUL
1 2020-06-01        NaT  6905456455              AL    IMA MEAN TURD MR SR        IMA        MEAN      TURD
Contents of test.txt
NM1*IL*1*SMITH*JOHN*PAUL*MR*JR~
INS*Y*18*001*AL*A*E**AC**N~
REF*1L*690553677~
DTP*348*D8*20200601~
DTP*349*D8*20200630~
HD*024**FAC*KJ/165/////1M*IND~
NM1*IL*1*IMA*MEAN*TURD*MR*SR~
INS*Y*18*001*AL*A*E**AC**N~
REF*1L*6905456455~
DTP*348*D8*20200601~
HD*024**FAC*KJ/165/////1M*IND~
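Since the question notes that every record has an INS, REF, and HD field, a simpler alternative to the look-ahead is to pad end_date when the HD line arrives. A self-contained sketch of just the date handling, run against the same test.txt:
import re

start_date, end_date = [], []
with open('test.txt') as f:
    lines = [[p for p in re.split(r'\*|/', x.strip().replace('~', '')) if p] for x in f]
for line in lines:
    if line[0] == 'DTP' and line[1] == '348':
        start_date.append(line[3])
    elif line[0] == 'DTP' and line[1] == '349':
        end_date.append(line[3])
    elif line[0] == 'HD':
        # every record closes with HD, so pad the missing end_date here
        if len(end_date) < len(start_date):
            end_date.append(None)
print(start_date)  # ['20200601', '20200601']
print(end_date)    # ['20200630', None]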

Summing up datetimes without using pandas

I have a data set of rainfall in half-hour intervals. I want to sum up the rainfall for each day and keep track of how many data points are summed per day, to account for data gaps. Then I want to create a new file with a column for the date, a column for the rainfall, and a column for how many data points were available to sum for each day.
dailysum is my function that tries to do this; get_data is my function for extracting the data.
from datetime import datetime

def get_data(avrains):
    print('opening {}'.format(avrains))
    with open(avrains, 'r') as rfile:
        header = rfile.readline()
        dates = []
        rainfalls = []
        for line in rfile:
            line = line.strip()
            row = line.split(',')
            d = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
            r = row[-1]
            dates.append(d)
            rainfalls.append(float(r))
    data = zip(dates, rainfalls)
    data = sorted(data)
    return data
def dailysum(rains):
    day_date = []
    rain_sum = []
    for i in rains:
        dayi = i[0]
        rainsi = i[1]
        for i in dayi:
            try:
                if dayi[i] == dayi[i+1]:
                    s = rains[i] + rains[i+1]
                    rain_sum.append(float(s))
            except:
                pass
            day_date.append(dayi[i])
There are a lot of ways to solve this, but I'll try to stay as close to your existing code as I can:
from datetime import datetime

def get_data(avrains):
    """
    Opens the file specified in avrains and returns a dictionary
    keyed by date, containing a 2-tuple of the total rainfall and
    the count of data points, like so:
        {
         date(2018, 11, 1): (0.25, 6),
         date(2018, 11, 2): (0.00, 5),
        }
    """
    print('opening {}'.format(avrains))
    rainfall_totals = dict()
    with open(avrains, 'r') as rfile:
        header = rfile.readline()
        for line in rfile:
            line = line.strip()
            row = line.split(',')
            d = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S').date()  # key by day, not half-hour
            r = float(row[-1])
            try:
                daily_rainfall, daily_count = rainfall_totals[d]
                daily_rainfall += r
                daily_count += 1
                rainfall_totals[d] = (daily_rainfall, daily_count)
            except KeyError:
                # if we haven't seen that date yet, add it
                rainfall_totals[d] = (r, 1)
    return rainfall_totals
Now when you call get_data("/path/to/file"), you'll get back a dictionary. You can spit out the values with something like this:
foo = get_data("/path/to/file")
for measure_date, (rainfall, observations) in foo.items():
    print measure_date, rainfall, observations
(I will leave the formatting of the date, and any sorting or file-writing as an exercise :) )
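For anyone who does want that exercise spelled out, here is one possible sketch of the sorting and file-writing, assuming the dictionary returned by get_data above:
import csv

totals = get_data('/path/to/file')
with open('daily_totals.csv', 'w') as out:
    writer = csv.writer(out)
    writer.writerow(['date', 'rainfall', 'data_points'])
    for day in sorted(totals):
        rainfall, count = totals[day]
        writer.writerow([day.isoformat(), rainfall, count])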

Appending values in rows into new columns?

I have this data in a CSV:
first, middle, last, id, fte
Alexander,Frank,Johnson,460700,1
Ashley,Jane,Smith,470000,.5
Ashley,Jane,Smith,470000,.25
Ashley,Jane,Smith,470000,.25
Steve,Robert,Brown,460001,1
I need to find the rows of people with the same ID numbers, and then combine the FTEs of those rows into the same line. I'll also need to add 0s for the rows that don't have duplicates. For example (using data above):
first, middle, last, id, fte1, fte2, fte3, fte4
Alexander,Frank,Johnson,460700,1,0,0,0
Ashley,Jane,Smith,470000,.5,.25,.25,0
Steve,Robert,Brown,460001,1,0,0,0
Basically, we're looking at the jobs people hold. Some people work one 40-hour per week job (1.0 FTE), some work two 20-hour per week jobs (.5 and .5 FTEs), some might work 4 10-hour per week jobs (.25, .25, .25, and .25 FTEs), and some may have other combinations. We only get one row of data per employee, so we need the FTEs on the same row.
This is what we have so far. Right now, our current code only works if they have two FTEs. If they have three or four, it just overwrites them with the last two (so if they have 3, it gives us 2 and 3. If they have 4, it gives us 3 and 4).
import csv

f = open('data.csv')
csv_f = csv.reader(f)

dataset = []
for row in csv_f:
    dictionary = {}
    dictionary["first"] = row[2]
    dictionary["middle"] = row[3]
    dictionary["last"] = row[4]
    dictionary["id"] = row[10]
    dictionary["fte"] = row[12]
    dataset.append(dictionary)
f.close()

def is_match(dict1, dict2):
    return dict1["id"] == dict2["id"]

def find_match(dictionary, dict_list):
    for index in range(0, len(dict_list)):
        if is_match(dictionary, dict_list[index]):
            return index
    return -1

def process_data(dataset):
    result = []
    for index in range(1, len(dataset)):
        data_dict = dataset[index]
        match_index = find_match(data_dict, result)
        id = str(data_dict["id"])
        if match_index == -1:
            result.append(data_dict)
        else:
            result[match_index]["fte2"] = data_dict["fte"]
    return result

for row in process_data(dataset):
    print(row)
Any help would be extremely appreciated! Thanks!
I would say to use the pandas library to make this simple. You can use groupby along with aggregation. Below is an example following the aggregation example provided at https://www.tutorialspoint.com/python_pandas/python_pandas_groupby.htm.
import pandas as pd
import numpy as np

df = pd.read_csv('filename.csv')
grouped = df.groupby('id')
print grouped['fte'].agg(np.sum)
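That sums the FTEs per id; if you need the fte1..fte4 columns from the question rather than a plain sum, one way (a sketch, assuming the sample CSV from the question, here named data.csv) is to number each person's rows with cumcount and pivot:
import pandas as pd

df = pd.read_csv('data.csv', skipinitialspace=True)
df['n'] = df.groupby('id').cumcount() + 1  # 1, 2, 3... within each id
wide = df.pivot_table(index=['first', 'middle', 'last', 'id'],
                      columns='n', values='fte', fill_value=0)
wide.columns = ['fte{}'.format(c) for c in wide.columns]
print(wide.reset_index())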

trying to create a dictionary from a text file

fieldict(filename) reads a file in DOT format and returns a dictionary with the DOT CMPLID, converted to an integer, as the key, and a tuple as the corresponding value for that key. The format of the tuple is:
(manufacturer, date, crash, city, state)
For example:
fieldict("DOT500.txt")[416]
('DAIMLERCHRYSLER CORPORATION', datetime.date(1995, 1, 9), False, 'ARCADIA',
So far, I have tried:
from collections import defaultdict
import datetime

def fieldict(filename):
    with open(filename) as f:
        x = [line.split('\t')[0].strip() for line in f]  # list of complaint numbers
        y = line.split('\t')  # list of full complaints
        d = {}
        for j in x:
            Y = True
            N = False
            d[j] = tuple(y[2], datetime.date(y[7]), y[6], y[12], y[13])  # dict keyed by complaint number, tuple of fields as value
        return d
No luck... I think I am close. Any help is greatly appreciated.
EDIT: each complaint is formatted like this
'11\t958128\tDAIMLERCHRYSLER CORPORATION\tDODGE\tSHADOW\t1990\tY\t19941117\tN\t0\t0\tENGINE AND ENGINE COOLING:ENGINE\tWILMINGTON \tDE\t1B3XT44KXLN\t19950103\t19950103\t\t1\tENGINE MOTOR MOUNTS FAILED, RESULTING IN ENGINE NOISE. *AK\tEVOQ\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tV\t\r\n'
The same entry without the escape characters showing:
11 958128 DAIMLERCHRYSLER CORPORATION DODGE SHADOW 1990 Y 19941117 N 0 0 ENGINE AND ENGINE COOLING:ENGINE WILMINGTON DE 1B3XT44KXLN 19950103 19950103 1 ENGINE MOTOR MOUNTS FAILED, RESULTING IN ENGINE NOISE. *AK EVOQ
Looks like you want to make friends with the csv module, as this looks like tab-formatted CSV text. The csv.reader() has a .next() method which is called when you throw it in a for loop, so you can go line by line through the file.
As a general tip, read PEP 8, and use understandable variable names.
With Python, if it starts to feel hard, that's a good sign that there is usually a better way.
import csv
import datetime

def _build_datetime(line):
    year_idx = x    # placeholder indexes: fill in the correct columns
    month_idx = y   # for year, month, and day in your data
    day_idx = z
    indexes = (year_idx, month_idx, day_idx)
    result_datetime = None
    if all(line[idx] for idx in indexes):  # check that the expected values are populated
        int_values = [int(line[idx]) for idx in indexes]
        result_datetime = datetime.date(*int_values)
    return result_datetime

def format2dict(filename):
    complaints = {}
    with open(filename, "rb") as in_f:
        reader = csv.reader(in_f, delimiter='\t')
        complaint_id_idx = 0
        manufacturer_idx = 2
        crash_idx = x   # placeholder index: fill in the correct column
        city_idx = 12
        state_idx = 13
        for line in reader:
            complaint_id = int(line[complaint_id_idx])
            data = (
                line[manufacturer_idx],
                _build_datetime(line),
                line[crash_idx],
                line[city_idx],
                line[state_idx],
            )
            complaints[complaint_id] = data
    return complaints

if __name__ == "__main__":
    formatted_data = format2dict("DOT500.txt")
Note: Trimming the newline is left up to the reader.
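One detail worth noting: the question's expected tuple shows crash as a boolean (False), while the sketch above stores the raw field. Assuming the crash column holds 'Y'/'N' flags, the data tuple inside the loop can convert it directly:
data = (
    line[manufacturer_idx],
    _build_datetime(line),
    line[crash_idx] == 'Y',  # 'Y' -> True, 'N' -> False
    line[city_idx],
    line[state_idx],
)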
A clean way of accomplishing this is to use dict(zip(headers, data_list)).
Presuming your sample data looks like:
joe\tSan Francisco\tapple
frank\tNew York City\torange
tim\tHawaii\tpineapple
You could do something like:
results = []
headers = ['person', 'place', 'fruit']
for line in open('datafile.txt').readlines():
    record = line.split('\t')
    results.append(dict(zip(headers, record)))
This will make a dict for each line and append it to the end of results, looking like:
[{'fruit': 'apple\n', 'person': 'joe', 'place': 'San Francisco'},
{'fruit': 'orange\n', 'person': 'frank', 'place': 'New York City'},
{'fruit': 'pineapple\n', 'person': 'tim', 'place': 'Hawaii'}]
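Those 'apple\n' values show the trailing newline still attached to the last field; a drop-in replacement for the record = line.split('\t') line above strips it first:
record = line.rstrip('\n').split('\t')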
You're on the right track with line.split('\t') to break up text into pieces. Try something like this to build up the tuple from the split pieces.
import datetime

a = '11\t958128\tDAIMLERCHRYSLER CORPORATION\tDODGE\tSHADOW\t1990\tY\t19941117\tN\t0\t0\tENGINE AND ENGINE COOLING:ENGINE\tWILMINGTON \tDE\t1B3XT44KXLN\t19950103\t19950103\t\t1\tENGINE MOTOR MOUNTS FAILED, RESULTING IN ENGINE NOISE. *AK\tEVOQ\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tV\t'

fields = a.split('\t')
recordNum = fields[0]
mfr = fields[2]
recDate = datetime.date(int(fields[5]), 1, 2)
make = fields[4]
DOTrecord = recordNum, mfr, recDate, make
print DOTrecord
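Note that recDate above hard-codes January 2nd and only uses the model year (fields[5]). The record's actual dates are YYYYMMDD strings, e.g. '19950103' at index 15 of the sample line, so a sketch of parsing one properly (index 15 assumed from that sample):
import datetime

fields = a.split('\t')  # `a` is the sample record from above
recDate = datetime.datetime.strptime(fields[15], '%Y%m%d').date()
print recDate  # 1995-01-03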
