Extract letters after $ symbol using Pandas - python
I am trying to extract just the data that starts with (and includes) the $ symbol from a spreadsheet.
I have isolated the data to give me just the column containing the data but what I am trying to do is extract any and all symbols that follow a $ symbol.
For example:
I want $AAPL, $LOW, $TSLA and so on from the entire dataset, but I don't need or want $1000, $600 and so on. I only want a $ that is followed by letters; the symbol may be followed by a period or a space, but just the characters a-z are what I am trying to get.
I haven't been successful in full extraction and my code is starting to get messy so I'll provide the code that will bring back the data for you to see for yourself. I am using Jupyter Notebook.
import mysql.connector
import pandas
# Load one worksheet of a Google Sheet as CSV, keep only the columns that are
# needed, drop the retweets and print what is left.
googleSheedID = '15fhpxqWDRWkNtEFhi9bQyWUg8pDn4B-R2N18s1xFYTU'
worksheetName = 'Sheet1'
URL = 'https://docs.google.com/spreadsheets/d/{0}/gviz/tq?tqx=out:csv&sheet={1}'.format(
    googleSheedID,
    worksheetName
)
df = pandas.read_csv(URL)
# Remove the columns that are not needed for the extraction.
del df['DATE']
del df['USERNAME']
del df['LINK']
del df['LINK2']
# Boolean indexing returns a NEW DataFrame rather than filtering in place, so
# the result has to be assigned back; without the assignment the filter is
# silently discarded and print(df) still shows the retweets.
df = df[df["TWEET"].str.contains("RT") == False]
print(df)
I'm not sure I understand correctly what you want, but the following code collects every token that starts with $ and runs up to the next blank space.
import mysql.connector
import pandas
# Spreadsheet to load: the document id and the name of the worksheet tab.
googleSheedID = '15fhpxqWDRWkNtEFhi9bQyWUg8pDn4B-R2N18s1xFYTU'
worksheetName = 'Sheet1'
# Fill the document id and worksheet name into the CSV export URL.
URL = 'https://docs.google.com/spreadsheets/d/{0}/gviz/tq?tqx=out:csv&sheet={1}'.format(googleSheedID, worksheetName)
# Read the worksheet into a DataFrame, then discard the columns that the
# extraction below does not use.
df = pandas.read_csv(URL)
df.drop(columns=['DATE', 'USERNAME', 'LINK', 'LINK2'], inplace=True)
# Collect every distinct "$..." token (a "$" up to the next space or newline)
# from the tweets, in order of first appearance, skipping "$<digit>" amounts.
unique_results = []
for tweet in df['TWEET']:
    # Tweets containing "RT" anywhere are treated as retweets and ignored.
    if 'RT' in tweet:
        continue
    # The last character is never tested as a "$": a token needs at least one
    # character after the "$" (and tweet[start + 1] must exist).
    for start in range(len(tweet) - 1):
        if tweet[start] != '$':
            continue
        # "$" followed by a digit is a money amount such as $1000, not a symbol.
        if tweet[start + 1] in '0123456789':
            continue
        # Default to the end of the tweet. Without this reset, a token that
        # runs to the end of the tweet (no space/newline after it) would reuse
        # the stale `end` left over from an earlier token and produce a wrong
        # or empty slice.
        end = len(tweet)
        for k in range(start, len(tweet)):
            if tweet[k] == ' ' or tweet[k] == '\n':
                end = k
                break
        results = tweet[start:end]
        if results not in unique_results:
            unique_results.append(results)
print(unique_results)
edit: fixed the code
The outputs are:
['$GME', '$SNDL', '$FUBO', '$AMC', '$LOTZ', '$CLOV', '$USAS', '$AIHS', '$PLM', '$LODE', '$TTNP', '$IMTE', '', '$NAK.', '$NAK', '$CRBP', '$AREC', '$NTEC', '$NTN', '$CBAT', '$ZYNE', '$HOFV', '$GWPH', '$KERN', '$ZYNE,', '$AIM', '$WWR', '$CARV', '$VISL', '$SINO', '$NAKD', '$GRPS', '$RSHN', '$MARA', '$RIOT', '$NXTD', '$LAC', '$BTC', '$ITRM', '$CHCI', '$VERU', '$GMGI', '$WNBD', '$KALV', '$EGOC', '$Veru', '$MRNA', '$PVDG', '$DROP', '$EFOI', '$LLIT', '$AUVI', '$CGIX', '$RELI', '$TLRY', '$ACB', '$TRCH', '$TRCH.', '$TSLA', '$cciv', '$sndl', '$ANCN', '$TGC', '$tlry', '$KXIN', '$AMZN', '$INFI', '$LMND', '$COMS', '$VXX', '$LEDS', '$ACY', '$RHE', '$SINO.', '$GPL', '$SPCE', '$OXY', '$CLSN', '$FTFT', '$FTFT.....', '$BIEI', '$EDRY', '$CLEU', '$FSR', '$SPY', '$NIO', '$LI', '$XPEV,', '$UL', '$RGLG', '$SOS', '$QS', '$THCB', '$SUNW', '$MICT', '$BTC.X', '$T', '$ADOM', '$EBON', '$CLPS', '$HIHO', '$ONTX', '$WNRS', '$SOLO', '$Mara,', '$Riot,', '$SOS,', '$GRNQ,', '$RCON,', '$FTFT,', '$BTBT,', '$MOGO,', '$EQOS,', '$CCNC', '$CCIV', '$tsla', '$fsr', '$wkhs', '$ride', '$nio', '$NETE', '$DPW', '$MOSY', '$SSNT', '$PLTR', '$GSAH:', '$EQOS', '$MTSL', '$CMPS', '$CHIF', '$MU', '$HST', '$SNAP', '$CTXR', '$acy', '$FUBOTV', '$DPBE', '$HYLN', '$SPOT', '$NSAV', '$HYLN,', '$aabb', '$AAL', '$BBIG', '$ITNS', '$CTIB', '$AMPG', '$ZI', '$NUVI', '$INTC', '$TSM', '$AAPL', '$MRJT', '$RCMT', '$IZEA', '$BBIG,', '$ARKK', '$LIAUTO', '$MARA:', '$SOS:', '$XOM', '$ET', '$BRNW', '$SYPR', '$LCID', '$QCOM', '$FIZZ', '$TRVG', '$SLV', '$RAFA', '$TGCTengasco,', '$BYND', '$XTNT', '$NBY', '$sos', '$KMPH', '$', '$(0.60)', '$(0.64)', '$BIDU', '$rkt', '$GTT', '$CHUC', '$CLF', '$INUV', '$RKT', '$COST', '$MDCN', '$HCMC', '$UWMC', '$riot', '$OVID', '$HZON', '$SKT', '$FB', '$PLUG', '$BA', '$PYPL', '$PSTH.', '$NVDA', '$AMPG.', '$aese.', '$spy', '$pltr', '$MSFT', '$AMD', '$QQQ', '$LTNC', '$WKHS', '$EYES', '$RMO', '$GNUS', '$gme', '$mdmp', '$kern', '$AEI', '$BABA', '$YALA', '$TWTR', '$WISH', '$GE', '$ORCL', '$JUPW', '$TMBR', '$SSYS', 
'$NKE', '$AMPGAmpliTech', '$$$', '$$', '$RGLS', '$HOGE', '$GEGR', '$nclh', '$IGAC', '$FCEL', '$TKAT', '$OCG', '$YVR', '$IPDN.', '$IPDN', "$SINO's", '$WIMI', '$TKAT.', '$BAC', '$LZR', '$LGHL', '$F', '$GM', '$KODK', '$atvk', '$ATVK', '$AIKI', '$DS', '$AI', '$WTII', '$oxy', '$DYAI', '$DSS', '$ZKIN', '$MFH', '$WKEY', '$MKGI', '$DLPN', '$PSWW', '$SNOW', '$ALYA', '$AESE', '$CSCW', '$CIDM', '$HOFV.', '$LIVX', '$FNKO', '$HPR', '$BRQS', '$GIGM', '$APOP', '$EA', '$CUEN', '$TMBR?', '$FLNT,', '$APPS', '$METX', '$STG', '$WSRC', '$AMHC', '$VIAC', '$MO', '$UAVL', '$CS', '$MDT', '$GYST', '$CBBT', '$ASTC', '$AACG', '$WAFU.', '$WAFU', '$CASI', '$mmmw', '$MVIS', '$SNOA', '$C', '$KR', '$EWZ', '$VALE', '$EWZ.', '$CSCO', '$PINS', '$XSPA', '$VPRX', '$CEMI', '$M', '$BMRA', '$SPX', '$akt', '$SURG', '$NCLH', '$ARSN', '$ODT', '$SGBX', '$CRWD.', '$TGRR', '$PENN', '$BB', '$XOP', '$XL', '$FREQ', '$IDRA', '$DKNG', '$COHN', '$ADHC', '$ISWH', '$LEGO', '$OTRA', '$NAAC', '$HCAR', '$PPGH', '$SDAC', '$PNTM', '$OUST', '$IO', '$HQGE', '$HENC', '$KYNC', '$ATNF', '$BNSO', '$HDSN', '$AABB', '$SGH', '$BMY', '$VERY', '$EARS', '$ROKU', '$PIXY', '$APRE', '$SFET', '$SQ', '$EEIQ', '$REDU', '$CNWT', '$NFLX', '$RGBPP', '$RGBP', '$SHOP', '$VITL', '$RAAS', '$CPNG', '$JKS', '$COMP', '$NAFS']
You can use regular expressions.
\$[a-zA-Z]+
After reading the df execute the below code
import re
# Every matched symbol in order of appearance (duplicates are kept here).
results = []
# A "$" followed by one or more ASCII letters. Matching stops at the first
# non-letter, so "$600" and "$6FOOBAR" yield nothing, while "$FOO6BAR"
# yields "$FOO". Compiled once, outside the loop.
symbol_pattern = re.compile(r"\$[a-zA-Z]+")
# Iterate over the values directly: label-based df['TWEET'][row_num] only works
# while the index is the default 0..n-1 range (e.g. it breaks after filtering).
for string_to_check in df['TWEET']:
    # Skip retweets: only tweets that START with "RT".
    # ('RT' in string_to_check would also reject tweets with "RT" anywhere.)
    if string_to_check.startswith("RT"):
        continue
    # findall returns an empty list when nothing matches, so extend is a no-op.
    results.extend(symbol_pattern.findall(string_to_check))
# Convert to a set and back to a list to remove duplicates
# (the original order is not preserved).
final_results = list(set(results))
print(results)
print(final_results)
The results are
['$GME', '$FOOBAR', '$FOO', '$SNDL', '$FUBO', '$AMC', '$GME', '$LOTZ', '$CLOV', '$USAS', '$GOBLIN', '$LTNC']
['$LTNC', '$GOBLIN', '$AMC', '$FOO', '$FOOBAR', '$LOTZ', '$CLOV', '$SNDL', '$GME', '$USAS', '$FUBO']
Notice that the duplicate $GME appears only once in final_results.
If you do not need to remove the tweets starting with RT, all of this can be achieved in one line of code.
# str(df['TWEET']) would search the Series' printed representation, which
# pandas abbreviates (rows of a long Series are elided with "..." and long
# strings are cut off), so symbols would be silently missed. Join the actual
# tweet texts instead; the newline separator keeps neighbouring tweets apart.
direct_result = list(set(re.findall(r"\$[a-zA-Z]+", "\n".join(df['TWEET'].dropna().astype(str)))))
Related
Is there a way where I can replace the first '.' with '-' in my code for a domain generator
last time I've gotten some help on making a website name generator. I feel bad but i'm stuck at the moment and I need some help again to improve it. in my code there's a .txt file called combined which included these lines. After that i created a variable to add to the domain web = 'web' suffix = 'co.id' And then i write it out so that the it would print the line output to the Combined.txt output_count = 50 subdomain_count = 2 for i in range(output_count): out = [] for j in range(subdomain_count): out.append(random.choice(Test)) out.append(web) out.append(suffix) Example.write('.'.join(out)+"\n") with open("dictionaries/examples.txt") as f: websamples = [line.rstrip() for line in f] I want the output where instead of just login.download.web.co.id there would be more variety like login-download.web.co.id or login.download-web.co.id In the code i used Example.write('.'.join(out)+"\n") so that the. would be a separator for each characters. I was thinking of adding more, by making a similar code line and save it to a different .txt files but I feel like it would be too long. Is there a way where I can variate each character separation with this symbol - or _ instead of just a . in the output? Thanks!
Sure just iterate through a list of delimiters to add each of them to the output. web = 'web' suffix = 'co.id' output_count = 50 subdomain_count = 2 delimeters = [ '-', '.'] for i in range(output_count): out = [] for j in range(subdomain_count): out.append(random.choice(Test)) for delimeter in delimeters: addr = delimeter.join(out) addrs = '.'.join([addr, web, suffix]) print(addrs) Example.write(addrs + '\n') output my_pay.web.co.id my-pay.web.co.id my.pay.web.co.id pay_download.web.co.id pay-download.web.co.id pay.download.web.co.id group_login.web.co.id group-login.web.co.id group.login.web.co.id install_group.web.co.id install-group.web.co.id install.group.web.co.id ... ... update import itertools Test = ['download', 'login', 'my', 'ip', 'site', 'ssl', 'pay', 'install'] delimeters = [ '-', '.'] web = 'web' suffix = 'co.id' output_count = 50 subdomain_count = 2 for combo in itertools.combinations(Test, 2): out = '' for i, d in enumerate(delimeters): out = d.join(combo) out = delimeters[i-1].join([out, web]) addr = '.'.join([out, suffix]) print(addr) # Example.write(addr+'\n') output download-login.web.co.id download.login-web.co.id download-my.web.co.id download.my-web.co.id download-ip.web.co.id download.ip-web.co.id download-site.web.co.id download.site-web.co.id download-ssl.web.co.id download.ssl-web.co.id download-pay.web.co.id download.pay-web.co.id download-install.web.co.id download.install-web.co.id login-my.web.co.id login.my-web.co.id login-ip.web.co.id login.ip-web.co.id login-site.web.co.id login.site-web.co.id login-ssl.web.co.id login.ssl-web.co.id login-pay.web.co.id login.pay-web.co.id login-install.web.co.id login.install-web.co.id my-ip.web.co.id my.ip-web.co.id my-site.web.co.id my.site-web.co.id my-ssl.web.co.id my.ssl-web.co.id my-pay.web.co.id my.pay-web.co.id my-install.web.co.id my.install-web.co.id ip-site.web.co.id ip.site-web.co.id ip-ssl.web.co.id ip.ssl-web.co.id ip-pay.web.co.id ip.pay-web.co.id ip-install.web.co.id 
ip.install-web.co.id site-ssl.web.co.id site.ssl-web.co.id site-pay.web.co.id site.pay-web.co.id site-install.web.co.id site.install-web.co.id ssl-pay.web.co.id ssl.pay-web.co.id ssl-install.web.co.id ssl.install-web.co.id pay-install.web.co.id pay.install-web.co.id
As an alternative to replacing the final output, you could make the separator random: import random seperators = ['-', '_', '.'] Example.write(random.choice(seperators).join(out)+"\n")
In order to ensure compliance with RFC 1035 I would suggest: from random import choices as CHOICES, choice as CHOICE output_count = 50 subdomain_count = 2 web = 'web' suffix = 'co.id' dotdash = '.-' filename = 'output.txt' Test = [ 'auth', 'access', 'account', 'admin' # etc ] with open(filename, 'w') as output: for _ in range(output_count): sd = CHOICE(dotdash).join(CHOICES(Test, k=subdomain_count)) print('.'.join((sd, web, suffix)), file=output)
multiple separator in a string python
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical." I have this kind of string. I am facing the problem which splits it to 2 lists. Output will be approximately like this : name = ['Brand','Color','Type','Power Source'] value = ['Smart Plane','Yellow','Sandwich Maker','Electrical'] Is there any solution for this.
# Split "name.*/value" pairs, which are separated by ".#/", into two parallel
# lists: `name` and `value`.
name = []
value = []
# The input ends with a final "." that belongs to no value; drop it so the
# last value is "Electrical" rather than "Electrical.".
pairs = text[:-1] if text.endswith('.') else text
for pair in pairs.split('.#/'):
    if not pair:
        # Nothing between two separators (or empty input): no pair to record.
        continue
    # partition never raises: a segment without ".*/" gets an empty value
    # instead of triggering an IndexError.
    key, _, val = pair.partition('.*/')
    name.append(key)
    value.append(val)
This is one approach using re.split and list slicing. Ex: import re text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical." data = [i for i in re.split("[^A-Za-z\s]+", text) if i] name = data[::2] value = data[1::2] print(name) print(value) Output: ['Brand', 'Color', 'Type', 'Power Source'] ['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical']
You can use regex to split the text, and populate the lists in a loop. Using regex you protect your code from invalid input. import re name, value = [], [] for ele in re.split(r'\.#\/', text): k, v = ele.split('.*/') name.append(k) value.append(v) >>> print(name, val) ['Brand', 'Color', 'Type', 'Power Source'] ['Smart Planet', 'Yellow', 'Sandwich Maker', 'Electrical.']
text="Brand.*/Smart Planet.#/Color.*/Yellow.#/Type.*/Sandwich Maker.#/Power Source.*/Electrical." name=[] value=[] word='' for i in range(len(text)): temp=i if text[i]!='.' and text[i]!='/' and text[i]!='*' and text[i]!='#': word=word+''.join(text[i]) elif temp+1<len(text) and temp+2<=len(text): if text[i]=='.' and text[temp+1]=='*' and text[temp+2]=='/': name.append(word) word='' elif text[i]=='.' and text[temp+1]=='#' and text[temp+2]=='/': value.append(word) word='' else: value.append(word) print(name) print(value) this will be work...
If in statement problems
I am attempting to make a plan, which is a list of classes that can only be added when the required classes have been completed or the co-requisite classes are being taken in the same semester. Below I have my code that almost works but it always reuses the classes even though they have already been completed/used. I tried to prevent this with and (class_list[i][0] not in classes_done), I was hoping that it wouldn't go into the if statement but it seems like it's being ignored. The rest of this if statement seems to work fine. (class_list[i][3] == '' or class_list[i][3] in classes_done) does this class have a required completed class if yes has it been completed? (class_list[i][2] in classes_for_semester or class_list[i][2] == '')does this class have a co-requisite class if yes is it in the class_for_semester or already completed? The class_list variable is organized like this['name', 'credit', 'co-requisite', 'required completed classes', 'empty']. I added the other variables as comments to show what they look like. class PlanGenerator: def generator(max_credit_allowed, min_credit_allowed, classes_done, class_list): classes_for_semester = [] credits_for_semester = 0 semester = 0 full_plan = [] # class_list = [['MA 241 ', '4', '', '', ''], ['PS 150 ', '3', 'MA 241 ', '', ''], ['UNIV 101', '1', '', '', ''], ['COM 122', '3', '', '', ''], ...] 
# max_credit_allowed = 16 # min_credit_allowed = 12 # classes_done=['UNIV 101'] while len(classes_done) != len(class_list): # keep going until all classes are used while int(min_credit_allowed) > credits_for_semester: # keep going until at least the minimum credits are in the semester semester += 1 for i in range(len(class_list)): # looping over the class list if int(class_list[i][1]) + credits_for_semester < max_credit_allowed: #if this class was to be added would it go over the max credit for semester if yes go to next class if (class_list[i][3] == '' or class_list[i][3] in classes_done) and (class_list[i][2] in classes_for_semester or class_list[i][2] in classes_done or class_list[i][2] == '') and (class_list[i][0] not in classes_done): classes_for_semester.append(class_list[i][0]) credits_for_semester += int(class_list[i][1]) print('classes for semester', classes_for_semester) print('semester credits', credits_for_semester) classes_done.append(classes_for_semester) full_plan.append(semester) full_plan.append(classes_for_semester) print('full plan', full_plan) classes_for_semester = [] credits_for_semester = 0 print('done') print(full_plan) I hope my explanation makes sense. Maybe somebody can understand my mistake and help me find a good solution. Also if you have anything that you see would make this code more simple please let me know. Much appreciated
First, your while int(min_credit_allowed) > credits_for_semester line is leading to an infinite loop. It needs to be changed to while len(classes_done) != len(class_list) and int(min_credit_allowed) > credits_for_semester: # Remove the second while loop Secondly, you're appending a list to a list, so you get a 2-D list for classes_done with classes_done.append(classes_for_semester) This should be classes_done += classes_for_semester so that you add the items from classes_for_semester into classes_done, rather than adding a list. Your new code should look like this: def generator(max_credit_allowed, min_credit_allowed, classes_done, class_list): classes_for_semester = [] credits_for_semester = 0 semester = 0 full_plan = [] # class_list = [['MA 241 ', '4', '', '', ''], ['PS 150 ', '3', 'MA 241 ', '', ''], ['UNIV 101', '1', '', '', ''], ['COM 122', '3', '', '', ''], ...] # max_credit_allowed = 16 # min_credit_allowed = 12 # classes_done=['UNIV 101'] while len(classes_done) != len(class_list) and int(min_credit_allowed) > credits_for_semester: # keep going until at least the minimum credits are in the semester semester += 1 for i in range(len(class_list)): # looping over the class list if int(class_list[i][1]) + credits_for_semester < max_credit_allowed: #if this class was to be added would it go over the max credit for semester if yes go to next class if (class_list[i][3] == '' or class_list[i][3] in classes_done) and (class_list[i][2] in classes_for_semester or class_list[i][2] in classes_done or class_list[i][2] == '') and (class_list[i][0] not in classes_done): classes_for_semester.append(class_list[i][0]) credits_for_semester += int(class_list[i][1]) print('classes for semester', classes_for_semester) print('semester credits', credits_for_semester) classes_done += classes_for_semester full_plan.append(semester) full_plan.append(classes_for_semester) print('full plan', full_plan) classes_for_semester = [] credits_for_semester = 0 print('done') print(full_plan) I 
would highly recommend using None instead of '' for the non-existent values, that way you can do a simple value is None check instead of an equality check to an empty string. For the lists of class information you're passing in, I would change them to classes, dictionaries, or namedtuples (find out more about them here) so that you can easily refer to the values by name rather than numbers. class_list[i].class_name or class_list[i]['class_name'] are a lot easier to debug in the future than magic indices. You can even change your for loop to use the actual class details as a variable instead of i in range(len(class_list)) like so: for c in class_list: if int(c.credits) .... # Using a class or namedtuple approach as suggested above And one minor thing that probably isn't a huge issue but could become a concern if these lists were to grow long: consider using sets instead of lists for storing things like classes_done and classes_for_semester. It also prevents duplicates from being stored (assuming you don't want to store the same class more than once). To provide a concrete example of the namedtuple suggestion, you can do the following: from collections import namedtuple ClassList = namedtuple('ClassList', ['class_name', 'credits', 'coreq', 'prereq']) class_list = [ ClassList(class_name='MA 241', credits=4, coreq=None, prereq=None), ClassList(class_name='PS 150', credits=3, coreq='MA 241', prereq=None), # ... ] So your for loop becomes for c in class_list: if c.credits + credits_for_semester < max_credits_allowed: if (c.prereq is None or c.prereq in classes_done) and \ (c.coreq in classes_for_semester or c.coreq in classes_done or c.coreq is None) and \ (c.class_name not in classes_done): classes_for_semester.append(c.class_name) credits_for_semester += c.credits classes_done += classes_for_semester full_plan.append(semester) full_plan.append(classes_for_semester) classes_for_semester = [] credits_for_semester = 0
Python - How to count specific section in a list
I'm brand new to python and I'm struggling how to add certain sections of a cvs file in python. I'm not allowed to use "import cvs" I'm importing the TipJoke CVS file from https://vincentarelbundock.github.io/Rdatasets/datasets.html This is the only code I have so far that worked and I'm at a total loss on where to go from here. if __name__ == '__main__': from pprint import pprint from string import punctuation f = open("TipJoke.csv", "r") tipList = [] for line in f: #deletes the quotes line = line.replace('"', '') tipList.append(line) pprint(tipList[]) Output: [',Card,Tip,Ad,Joke,None\n', '1,None,1,0,0,1\n', '2,Joke,1,0,1,0\n', '3,Ad,0,1,0,0\n', '4,None,0,0,0,1\n', '5,None,1,0,0,1\n', '6,None,0,0,0,1\n', '7,Ad,0,1,0,0\n', '8,Ad,0,1,0,0\n', '9,None,0,0,0,1\n', '10,None,0,0,0,1\n', '11,None,1,0,0,1\n', '12,Ad,0,1,0,0\n', '13,None,0,0,0,1\n', '14,Ad,1,1,0,0\n', '15,Joke,1,0,1,0\n', '16,Joke,0,0,1,0\n', '17,Joke,1,0,1,0\n', '18,None,0,0,0,1\n', '19,Joke,0,0,1,0\n', '20,None,0,0,0,1\n', '21,Ad,1,1,0,0\n', '22,Ad,1,1,0,0\n', '23,Ad,0,1,0,0\n', '24,Joke,0,0,1,0\n', '25,Joke,1,0,1,0\n', '26,Joke,0,0,1,0\n', '27,None,1,0,0,1\n', '28,Joke,1,0,1,0\n', '29,Joke,1,0,1,0\n', '30,None,1,0,0,1\n', '31,Joke,0,0,1,0\n', '32,None,1,0,0,1\n', '33,Joke,1,0,1,0\n', '34,Ad,0,1,0,0\n', '35,Joke,0,0,1,0\n', '36,Ad,1,1,0,0\n', '37,Joke,0,0,1,0\n', '38,Ad,0,1,0,0\n', '39,Joke,0,0,1,0\n', '40,Joke,0,0,1,0\n', '41,Joke,1,0,1,0\n', '42,None,0,0,0,1\n', '43,None,0,0,0,1\n', '44,Ad,0,1,0,0\n', '45,None,0,0,0,1\n', '46,None,0,0,0,1\n', '47,Ad,0,1,0,0\n', '48,Joke,0,0,1,0\n', '49,Joke,1,0,1,0\n', '50,None,1,0,0,1\n', '51,None,0,0,0,1\n', '52,Joke,1,0,1,0\n', '53,Joke,1,0,1,0\n', '54,Joke,0,0,1,0\n', '55,None,1,0,0,1\n', '56,Ad,0,1,0,0\n', '57,Joke,0,0,1,0\n', '58,None,0,0,0,1\n', '59,Ad,0,1,0,0\n', '60,Joke,1,0,1,0\n', '61,Ad,0,1,0,0\n', '62,None,1,0,0,1\n', '63,Joke,0,0,1,0\n', '64,Ad,0,1,0,0\n', '65,Joke,0,0,1,0\n', '66,Ad,0,1,0,0\n', '67,Ad,0,1,0,0\n', '68,Ad,0,1,0,0\n', '69,None,0,0,0,1\n', 
'70,Joke,1,0,1,0\n', '71,None,1,0,0,1\n', '72,None,0,0,0,1\n', '73,None,0,0,0,1\n', '74,Joke,0,0,1,0\n', '75,Ad,1,1,0,0\n', '76,Ad,0,1,0,0\n', '77,Ad,1,1,0,0\n', '78,Joke,0,0,1,0\n', '79,Joke,0,0,1,0\n', '80,Ad,1,1,0,0\n', '81,Ad,0,1,0,0\n', '82,None,0,0,0,1\n', '83,Ad,0,1,0,0\n', '84,Joke,0,0,1,0\n', '85,Joke,0,0,1,0\n', '86,Ad,1,1,0,0\n', '87,None,1,0,0,1\n', '88,Joke,1,0,1,0\n', '89,Ad,0,1,0,0\n', '90,None,0,0,0,1\n', '91,None,0,0,0,1\n', '92,Joke,0,0,1,0\n', '93,Joke,0,0,1,0\n', '94,Ad,0,1,0,0\n', '95,Ad,0,1,0,0\n', '96,Ad,0,1,0,0\n', '97,Joke,1,0,1,0\n', '98,None,0,0,0,1\n', '99,None,0,0,0,1\n', '100,None,1,0,0,1\n', '101,Joke,0,0,1,0\n', '102,Joke,0,0,1,0\n', '103,Ad,1,1,0,0\n', '104,Ad,0,1,0,0\n', '105,Ad,0,1,0,0\n', '106,Ad,1,1,0,0\n', '107,Ad,0,1,0,0\n', '108,None,0,0,0,1\n', '109,Ad,0,1,0,0\n', '110,Joke,1,0,1,0\n', '111,None,0,0,0,1\n', '112,Ad,0,1,0,0\n', '113,Ad,0,1,0,0\n', '114,None,0,0,0,1\n', '115,Ad,0,1,0,0\n', '116,None,0,0,0,1\n', '117,None,0,0,0,1\n', '118,Ad,0,1,0,0\n', '119,None,1,0,0,1\n', '120,Ad,1,1,0,0\n', '121,Ad,0,1,0,0\n', '122,Ad,1,1,0,0\n', '123,None,0,0,0,1\n', '124,None,0,0,0,1\n', '125,Joke,1,0,1,0\n', '126,Joke,1,0,1,0\n', '127,Ad,0,1,0,0\n', '128,Joke,0,0,1,0\n', '129,Joke,0,0,1,0\n', '130,Ad,0,1,0,0\n', '131,None,0,0,0,1\n', '132,None,0,0,0,1\n', '133,None,0,0,0,1\n', '134,Joke,1,0,1,0\n', '135,Ad,0,1,0,0\n', '136,None,0,0,0,1\n', '137,Joke,0,0,1,0\n', '138,Ad,0,1,0,0\n', '139,Ad,0,1,0,0\n', '140,None,0,0,0,1\n', '141,Joke,0,0,1,0\n', '142,None,0,0,0,1\n', '143,Ad,0,1,0,0\n', '144,None,1,0,0,1\n', '145,Joke,0,0,1,0\n', '146,Ad,0,1,0,0\n', '147,Ad,0,1,0,0\n', '148,Ad,0,1,0,0\n', '149,Joke,1,0,1,0\n', '150,Ad,1,1,0,0\n', '151,Joke,1,0,1,0\n', '152,None,0,0,0,1\n', '153,Ad,0,1,0,0\n', '154,None,0,0,0,1\n', '155,None,0,0,0,1\n', '156,Ad,0,1,0,0\n', '157,Ad,0,1,0,0\n', '158,Joke,0,0,1,0\n', '159,None,0,0,0,1\n', '160,Joke,1,0,1,0\n', '161,None,1,0,0,1\n', '162,Ad,1,1,0,0\n', '163,Joke,0,0,1,0\n', '164,Joke,0,0,1,0\n', 
'165,Ad,0,1,0,0\n', '166,Joke,1,0,1,0\n', '167,Joke,1,0,1,0\n', '168,Ad,0,1,0,0\n', '169,Joke,1,0,1,0\n', '170,Joke,0,0,1,0\n', '171,Ad,0,1,0,0\n', '172,Joke,0,0,1,0\n', '173,Joke,0,0,1,0\n', '174,Ad,0,1,0,0\n', '175,None,0,0,0,1\n', '176,Joke,1,0,1,0\n', '177,Ad,0,1,0,0\n', '178,Joke,0,0,1,0\n', '179,Joke,0,0,1,0\n', '180,None,0,0,0,1\n', '181,None,0,0,0,1\n', '182,Ad,0,1,0,0\n', '183,None,0,0,0,1\n', '184,None,0,0,0,1\n', '185,None,0,0,0,1\n', '186,None,0,0,0,1\n', '187,Ad,0,1,0,0\n', '188,None,1,0,0,1\n', '189,Ad,0,1,0,0\n', '190,Ad,0,1,0,0\n', '191,Ad,0,1,0,0\n', '192,Joke,1,0,1,0\n', '193,Joke,0,0,1,0\n', '194,Ad,0,1,0,0\n', '195,None,0,0,0,1\n', '196,Joke,1,0,1,0\n', '197,Joke,0,0,1,0\n', '198,Joke,1,0,1,0\n', '199,Ad,0,1,0,0\n', '200,None,0,0,0,1\n', '201,Joke,1,0,1,0\n', '202,Joke,0,0,1,0\n', '203,Joke,0,0,1,0\n', '204,Ad,0,1,0,0\n', '205,None,0,0,0,1\n', '206,Ad,0,1,0,0\n', '207,Ad,0,1,0,0\n', '208,Joke,0,0,1,0\n', '209,Ad,0,1,0,0\n', '210,Joke,0,0,1,0\n', '211,None,0,0,0,1\n'] I'm currently trying to find the Total number of entries of the specified card type and the Percentage of tips given for the specified card type with two decimal places of precision. The tip column is the 0 or 1 right after the card type (None, Ad, Joke).
If you are allowed to use the pandas library, then: import pandas as pd df = pd.read_csv("TipJoke.csv") df is a pandas DataFrame object on which you can perform multiple filtering tasks according to your needs. For example, if you want to get the data for Joke, you can filter like this: print(df[df["Card"] == "Joke"]) Note that I'm only pointing you in the right direction, not providing the whole logic for your question.
This works from pprint import pprint from string import punctuation counts = {"Joke": 0, "Ad": 0, "None": 0} with open("TipJoke.csv", "r") as f: for line in f: line_clean = line.replace('"', "").replace("\n", "").split(",") try: counts[line_clean[1]] += int(line_clean[2]) except: pass print(counts)
Finding the values and keys from dictionaries and verifying them
Given a dictionary: data = [{'id':'1234','name':'Jason','pw':'*sss*'}, {'id':'2345','name':'Tom','pw': ''}, {'id':'3456','name':'Art','pw': ''}, {'id':'2345','name':'Tom','pw':'*sss*'}] I need to find that the always pw contains '' or *sss*. I tried doing this: for d in data: if d['pw'] == ['*sss*' or ''] print "pw verified and it is '*sss*' or '' " else: print "pw is not any of two'*sss*' or ''" Please help me to complete this. I need to find that the always pw contains ' ' or '*sss*'. If possible I need to do it in a single line.
['*sss*' or ''] returns ['*sss*'] because '' is False and *sss* is considered True. That means your list reads as [True or False]. And the True factor is chosen (in this case, the *sss*. You probably meant to do something like: if d['pw'] in ['*sss*', '']: Or even: if d['pw'] == '*sss*' or d['pw'] == '': As a one liner (kinda): >>> for res in ("pw verified and it is '*sss*' or '' " if i['pw'] in ['*sss', ''] else "pw is not any of two'*sss*' or ''" for i in data): ... print res ... pw is not any of two'*sss*' or '' pw verified and it is '*sss*' or '' pw verified and it is '*sss*' or '' pw is not any of two'*sss*' or ''
Use set to do it in one single line. ans = {d['pw'] for d in data}.issubset({'','*sss*'}) ans is True if d['pw'] is always '' or '*sss*' else False
If you're looking for a one liner, use the all() function. >>> data = [{'id':'1234','name':'Jason','pw':'*sss*'}, {'id':'2345','name':'Tom','pw': ''}, {'id':'3456','name':'Art','pw': ''}, {'id':'2345','name':'Tom','pw':'*sss*'}] >>> all(elem['pw'] in ('', '*sss*') for elem in data) True For the if condition. >>> "pw verified" if all(elem['pw'] in ('', '*sss*') for elem in data) else "pw not verified" 'pw verified'