How to replace string in order - python

I want to replace some values in order. For example, below is a sample of an XPath:
/MCCI_IN200100UV01[#ITSVersion='XML_1.0'][#xsi:schemaLocation='urn:hl7-org:v3 MCCI_IN200100UV01.xsd']
/PORR_IN049016UV[r]/controlActProcess[#classCode='CACT']
[#moodCode='EVN']/subject[#typeCode='SUBJ'][1]/investigationEvent[#classCode='INVSTG']
[#moodCode='EVN']/outboundRelationship[#typeCode='SPRT'][relatedInvestigation/code[#code='2']
[#codeSystem='2.16.840.1.113883.3.989.2.1.1.22']][r]/relatedInvestigation[#classCode='INVSTG']
[#moodCode='EVN']/subjectOf2[#typeCode='SUBJ']/controlActEvent[#classCode='CACT']
[#moodCode='EVN']/author[#typeCode='AUT']/assignedEntity[#classCode='ASSIGNED']/assignedPerson[#classCode='PSN']
[#determinerCode='INSTANCE']/name/prefix[1]/#nullFlavor",
and I would like to find each [r] in order and replace it with [0] through [n], depending on the number of elements.
How can I replace [r]?

const txt = `/MCCI_IN200100UV01[#ITSVersion='XML_1.0'][#xsi:schemaLocation='urn:hl7-org:v3 MCCI_IN200100UV01.xsd']
/PORR_IN049016UV[r]/controlActProcess[#classCode='CACT']
[#moodCode='EVN']/subject[#typeCode='SUBJ'][1]/investigationEvent[#classCode='INVSTG']
[#moodCode='EVN']/outboundRelationship[#typeCode='SPRT'][relatedInvestigation/code[#code='2']
[#codeSystem='2.16.840.1.113883.3.989.2.1.1.22']][r]/relatedInvestigation[#classCode='INVSTG']
[#moodCode='EVN']/subjectOf2[#typeCode='SUBJ']/controlActEvent[#classCode='CACT']
[#moodCode='EVN']/author[#typeCode='AUT']/assignedEntity[#classCode='ASSIGNED']/assignedPerson[#classCode='PSN']
[#determinerCode='INSTANCE']/name/prefix[1]/#nullFlavor",`;
const count = (txt.match(/\[r\]/g) || []).length; // count occurrences of [r] with a RegExp
let replacements; // replacement values, in order
switch (count) {
  case 0:
    break;
  case 1:
    replacements = ["a"];
    break;
  case 2:
    replacements = ["___REPLACEMENT_1___", "___REPLACEMENT_2___"];
    break;
  case 3:
    replacements = ["d", "e", "f"];
    break;
}
let out = txt; // output variable
for (let i = 0; i < count; i++) {
  // String.prototype.replace with a string pattern replaces only the
  // first occurrence, so each pass consumes one [r]
  out = out.replace("[r]", replacements[i]);
}
console.log(out);

With str.replace(). For example:
>>> 'test[r]test'.replace('[r]', '[0]')
'test[0]test'
Here's the docs on it.
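If you want each [r] numbered in order, one way is re.sub() with a replacement function that pulls successive indices from a counter. A minimal sketch (the shortened txt here is illustrative, not the full XPath from the question):
import re
from itertools import count

txt = "/PORR_IN049016UV[r]/outboundRelationship[r]/name/prefix[1]"
counter = count(0)  # yields 0, 1, 2, ... on each next() call
result = re.sub(r"\[r\]", lambda m: "[{}]".format(next(counter)), txt)
print(result)
# /PORR_IN049016UV[0]/outboundRelationship[1]/name/prefix[1]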

Related

Count of string2 in string1 not working in C, but works in Python

The problem itself is simple: I have to count the number of occurrences of s2 in s1, and the length of s2 is always 2. I tried to implement it in C, but it did not work even though I know the logic is correct. Then I tried the same logic in Python and it works perfectly. Can someone explain why, or did I do something wrong in C? I have given both codes below.
C
#include <stdio.h>
#include <string.h>

int main()
{
    char s1[100], s2[2];
    int count = 0;
    gets(s1);
    gets(s2);
    for (int i = 0; i < strlen(s1); i++)
    {
        if (s1[i] == s2[0] && s1[i+1] == s2[1])
        {
            count++;
        }
    }
    printf("%d", count);
    return 0;
}
Python
s1 = input()
s2 = input()
count = 0
for i in range(0, len(s1)):
    if s1[i] == s2[0] and s1[i+1] == s2[1]:
        count = count + 1
print(count)
Your Python code is actually incorrect: it would raise an IndexError if the last character of s1 matches the first character of s2.
You have to stop iterating on the second to last character of s1.
Here is a generic solution working for any length of s2:
s1 = 'abaccabaabaccca'
s2 = 'aba'
count = 0
for i in range(len(s1) - len(s2) + 1):
    if s2 == s1[i:i+len(s2)]:
        count += 1
print(count)
output: 3
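As an aside, Python's built-in str.count only counts non-overlapping occurrences, so it agrees with the loop above only when matches cannot overlap:
s1, s2 = 'aaaa', 'aa'
print(s1.count(s2))  # 2: non-overlapping matches only
overlapping = sum(1 for i in range(len(s1) - len(s2) + 1) if s1[i:i+len(s2)] == s2)
print(overlapping)   # 3: counts a match at every start position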
First, as others have pointed out, you do not want to use gets(); try fgets() instead. Otherwise, your logic is correct, but when you read the input, the newline character will be included in the string.
If you were to input test and es, your strings would contain test\n and es\n (both also containing the null terminating byte \0). That leads to you searching the string test\n for the substring es\n, which it will not find. So you must first remove the newline character from, at least, the substring you want to search for, which you can do with strcspn() to get es.
Once the trailing newline (\n) has been replaced with a null terminating byte, you can search the string for occurrences.
#include <stdio.h>
#include <string.h>

int main() {
    char s1[100], s2[4];
    int count = 0;
    fgets(s1, sizeof s1, stdin);
    fgets(s2, sizeof s2, stdin);
    s1[strcspn(s1, "\n")] = '\0';
    s2[strcspn(s2, "\n")] = '\0';
    for (int i = 0; i + 1 < strlen(s1); i++) {
        if (s1[i] == s2[0] && s1[i+1] == s2[1]) {
            count++;
        }
    }
    printf("%d\n", count);
    return 0;
}

Is stripping string by '\r\n ' necessary in Python?

In Java, it's necessary to strip with \r\n, e.g. split( "\r\n") is not splitting my string in java
But is \r\n necessary in Python? Is the following true?
str.strip() == str.strip('\r\n ')
From the docs:
Return a copy of the string with the leading and trailing characters
removed. The chars argument is a string specifying the set of
characters to be removed. If omitted or None, the chars argument
defaults to removing whitespace. The chars argument is not a prefix or
suffix; rather, all combinations of its values are stripped
From this CPython test, str.strip() seems to be stripping:
\t\n\r\f\v
Anyone can point me to the code in CPython that does the string stripping?
Are you looking for these lines?
https://github.com/python/cpython/blob/e42b705188271da108de42b55d9344642170aa2b/Objects/unicodeobject.c#L12222-L12247
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

#define STRIPNAME(i) (stripfuncnames[i])

/* externally visible for str.strip(unicode) */
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    void *data;
    int kind;
    Py_ssize_t i, j, len;
    BLOOM_MASK sepmask;
    Py_ssize_t seplen;

    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
        return NULL;

    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);
    len = PyUnicode_GET_LENGTH(self);
    seplen = PyUnicode_GET_LENGTH(sepobj);
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                              PyUnicode_DATA(sepobj),
                              seplen);

    i = 0;
    if (striptype != RIGHTSTRIP) {
        while (i < len) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (!BLOOM(sepmask, ch))
                break;
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
                break;
            i++;
        }
    }

    j = len;
    if (striptype != LEFTSTRIP) {
        j--;
        while (j >= i) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
            if (!BLOOM(sepmask, ch))
                break;
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
                break;
            j--;
        }
        j++;
    }

    return PyUnicode_Substring(self, i, j);
}
Essentially:
str.strip() == str.strip(string.whitespace) == str.strip(' \t\n\r\f\v') != str.strip('\r\n')
Unless you are explicitly trying to remove ONLY newline characters, str.strip() and str.strip('\r\n') are different.
>>> '\nfoo\n'.strip()
'foo'
>>> '\nfoo\n'.strip('\r\n')
'foo'
>>> '\r\n\r\n\r\nfoo\r\n\r\n\r\n'.strip()
'foo'
>>> '\r\n\r\n\r\nfoo\r\n\r\n\r\n'.strip('\r\n')
'foo'
>>> '\n\tfoo\t\n'.strip()
'foo'
>>> '\n\tfoo\t\n'.strip('\r\n')
'\tfoo\t'
This all seems fine, but note that if there is whitespace (or any other character) between a newline and the start or end of a string, .strip('\r\n') won't remove the newline.
>>> '\t\nfoo\n\t'.strip()
'foo'
>>> '\t\nfoo\n\t'.strip('\r\n')
'\t\nfoo\n\t'
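One more difference: in Python 3, str.strip() with no argument removes any Unicode whitespace (anything for which str.isspace() is true), such as the no-break space, which a short literal like '\r\n ' never covers:
>>> '\u00a0foo\u00a0'.strip()
'foo'
>>> '\u00a0foo\u00a0'.strip('\r\n ')
'\xa0foo\xa0'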

Remove consecutive duplicate characters of even count from the given string

I'm trying to solve a problem where I get a string as input and then delete the runs of consecutive duplicate characters that have an even count.
Input: azxxzyyyddddyzzz
Output: azzz
Can you help me with this?
My attempt works for removing duplicate characters, but I'm stuck on how to remove only the duplicates of even count.
# Utility function to convert a string to a list
def toMutable(string):
    temp = []
    for x in string:
        temp.append(x)
    return temp

# Utility function to convert a list back to a string
def toString(List):
    return ''.join(List)

# Function to remove duplicates in a sorted list
def removeDupsSorted(List):
    res_ind = 1
    ip_ind = 1
    # In-place removal of duplicate characters
    while ip_ind != len(List):
        if List[ip_ind] != List[ip_ind-1]:
            List[res_ind] = List[ip_ind]
            res_ind += 1
        ip_ind += 1
    # After the step above the list still holds leftover characters
    # past res_ind; keep only the deduplicated prefix
    string = toString(List[0:res_ind])
    return string

# Function that removes duplicate characters from the string
def removeDups(string):
    # Convert string to list
    List = toMutable(string)
    # Sort the character list
    List.sort()
    # Remove duplicates from the sorted list
    return removeDupsSorted(List)

# Driver program to test the above functions
string = "geeksforgeeks"
print(removeDups(string))
Here's an attempt with itertools.groupby. I'm not sure if it can be done with better time complexity.
from itertools import groupby

def rm_even(s):
    to_join = []
    for _, g in groupby(s):
        chars = list(g)
        if len(chars) % 2:          # keep runs of odd length
            to_join.extend(chars)
    if len(to_join) == len(s):      # nothing was removed, so we are done
        return ''.join(to_join)
    return rm_even(to_join)
Demo:
>>> rm_even('azxxzyyyddddyzzz')
'azzz'
>>> rm_even('xAAAAx')
''
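If the repeated passes ever matter, a single left-to-right pass with a stack of [character, run_length] pairs runs in O(n). A minimal sketch of that idea (rm_even_stack is a made-up name, not from the answers here):
def rm_even_stack(s):
    stack = []  # [char, run_length] pairs; the top run is still open
    for ch in s:
        if stack and stack[-1][0] == ch:
            stack[-1][1] += 1
            continue
        # a differing character closes the top run; drop it if even
        if stack and stack[-1][1] % 2 == 0:
            stack.pop()
            # removing it may join the incoming char onto the run below
            if stack and stack[-1][0] == ch:
                stack[-1][1] += 1
                continue
        stack.append([ch, 1])
    # the end of the string also closes the final run
    if stack and stack[-1][1] % 2 == 0:
        stack.pop()
    return ''.join(c * n for c, n in stack)
With this, rm_even_stack('azxxzyyyddddyzzz') returns 'azzz' and rm_even_stack('xAAAAx') returns ''.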
Count the letters with Counter and remove the ones that have an even count. (Note this removes characters by their total count across the whole string, not by consecutive runs, so its output differs from the expected 'azzz' above.)
from collections import Counter

word = 'azxxzyyyddddyzzz'
count = Counter(word)  # Counter({'z': 5, 'y': 4, 'd': 4, 'x': 2, 'a': 1})
for key, value in count.items():
    if value % 2 == 0:
        word = word.replace(key, "")
print(word)  # 'azzzzz'
def remove_even_dup(string):
    spans = []
    for idx, letter in enumerate(string):
        if not len(spans) or spans[-1][0] != letter:
            spans.append((letter, {idx}))
        else:
            spans[-1][1].add(idx)
    # reverse the spans so we can use them as a stack
    spans = list(reversed(spans))
    visited = []
    while len(spans):
        letter, indexes = spans.pop()
        if len(indexes) % 2 != 0:
            visited.append((letter, indexes))
        else:
            # if we have spans on both sides we might need to merge
            if len(visited) and len(spans):
                prev_letter, prev_indexes = visited[-1]
                next_letter, next_indexes = spans[-1]
                # if the previous one and the next one have the same letter, merge them
                if prev_letter == next_letter:
                    # remove the old
                    visited.pop()
                    spans.pop()
                    # push the merged span back to be visited
                    spans.append((prev_letter, prev_indexes | next_indexes))
    to_keep = {idx for _, indexes in visited for idx in indexes}
    return ''.join(letter for idx, letter in enumerate(string) if idx in to_keep)
I used a Java Collection because it makes removal easy, and at the end we convert back to a string.
import java.util.*;

public class RemoveEvenCount {
    public static void main(String[] args) {
        //String a = "azxxzyyyddddyzzz";
        String a = "xAAAAx";
        ArrayList<Character> a2 = new ArrayList<>();
        for (int i = 0; i < a.length(); i++) {
            a2.add(a.charAt(i));
        }
        a2 = removeData(a2);
        System.out.print(a2);
    }

    public static ArrayList<Character> removeData(ArrayList<Character> a2) {
        if (a2.size() == 2) {
            if (a2.get(0).equals(a2.get(1)))
                return null;
        }
        for (int i = 0; i < a2.size(); i++) {
            int count = 1;
            for (int j = 0; j < a2.size() - 1; j++) {
                if (a2.get(j).equals(a2.get(j + 1))) {
                    count++;
                } else if (count % 2 == 0) {
                    for (int k = 0; k < count; k++) {
                        a2.remove(j - k);
                    }
                    return removeData(a2);
                } else {
                    count = 1;
                }
            }
        }
        return a2;
    }
}

How to Store Column Values in a Variable

I am dealing with a tab-separated file that contains multiple columns. Each column contains more than ~3000 records.
Column1 Column2 Column3 Column4
1000041 11657 GenNorm albumin
1000043 24249 GenNorm CaBP
1000043 29177 GenNorm calcium-binding protein
1000045 2006 GenNorm tropoelastin
Problem: using Python, how do I read the tab-separated file and store each column (with its records) in its own variable, then use print to print out a specific column or columns?
Preliminary code: I have used this code so far to read the TSV file:
import csv

Dictionary1 = {}
with open("sample.txt", 'r') as samplefile:
    reader = csv.reader(samplefile, delimiter="\t")
I think you're just asking how to "transpose" a CSV file from a sequence of rows to a sequence of columns.
In Python, you can always transpose any iterable of iterables by using the zip function:
with open("sample1.txt") as samplefile:
reader = csv.reader(samplefile, delimiter="\t")
columns = zip(*reader)
Now, if you want to print each column in order:
for column in columns:
    print(column)
Here, columns is an iterator of tuples. If you want some other format, like a dict mapping the column names to a list of values, you can transform it easily. For example:
columns = {column[0]: list(column[1:]) for column in columns}
Or, if you want to put them in four separate variables, you can just use normal tuple unpacking:
col1, col2, col3, col4 = columns
But there doesn't seem to be a very good reason to do that.
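If the first row really is a header (as in the sample), csv.DictReader gives you named access to each column without transposing. A small sketch along those lines, assuming sample.txt has that header line:
import csv

columns = {}
with open("sample.txt", newline="") as samplefile:
    reader = csv.DictReader(samplefile, delimiter="\t")
    for row in reader:
        # group each value under its column name from the header row
        for name, value in row.items():
            columns.setdefault(name, []).append(value)

print(columns["Column4"])  # ['albumin', 'CaBP', 'calcium-binding protein', 'tropoelastin']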
I'm not sure of the code in Python, but you can use a loop like this. Once you have stored everything in the dictionary, run this loop to split it into columns, then use the function to look up an index and print the record. You can modify the function to suit what you want the key to be, pass in a word to search for, etc.
int mainCounter = 0;
int counter1 = 0;
string arrColumn1[3000];
int counter2 = 0;
string arrColumn2[3000];
int counter3 = 0;
string arrColumn3[3000];
int counter4 = 0;
string arrColumn4[3000];
for (int i = 0; i < dictionary.length; ++i) {
    switch (mainCounter)
    {
        case 0:
            arrColumn1[counter1] = dictionary[i];
            ++counter1;
            ++mainCounter;
            break;
        case 1:
            arrColumn2[counter2] = dictionary[i];
            ++counter2;
            ++mainCounter;
            break;
        case 2:
            arrColumn3[counter3] = dictionary[i];
            ++counter3;
            ++mainCounter;
            break;
        case 3:
            arrColumn4[counter4] = dictionary[i];
            ++counter4;
            mainCounter = 0;
            break;
    }
}
void printRecordFunction(int colToSearch, string findThis, string arr1[], string arr2[], string arr3[], string arr4[]) {
    int foundIndex = 0;
    if (colToSearch == 1) {
        for (int i = 0; i < arr1.length; ++i) {
            if (arr1[i] == findThis) {
                foundIndex = i;
                break;
            }
        }
    } else if (colToSearch == 2) {
        for (int i = 0; i < arr2.length; ++i) {
            if (arr2[i] == findThis) {
                foundIndex = i;
                break;
            }
        }
    } else if (colToSearch == 3) {
        for (int i = 0; i < arr3.length; ++i) {
            if (arr3[i] == findThis) {
                foundIndex = i;
                break;
            }
        }
    } else if (colToSearch == 4) {
        for (int i = 0; i < arr4.length; ++i) {
            if (arr4[i] == findThis) {
                foundIndex = i;
                break;
            }
        }
    }
    cout << "Record: " << arr1[foundIndex] << " " << arr2[foundIndex] << " "
         << arr3[foundIndex] << " " << arr4[foundIndex] << endl;
}
Sorry, this is all pretty hard-coded, but I hope it gives you some idea, and you can adjust it.

How to efficiently filter a string against a long list of words in Python/Django?

Stackoverflow implemented its "Related Questions" feature by taking the title of the current question being asked and removing from it the 10,000 most common English words according to Google. The remaining words are then submitted as a fulltext search to find related questions.
I want to do something similar in my Django site. What is the best way to filter a string (the question title in this case) against a long list of words in Python? Any libraries that would enable me to do that efficiently?
You could do this very simply using the set and string functionality in Python and see how it performs (premature optimisation being the root of all evil!):
common_words = frozenset(("if", "but", "and", "the", "when", "use", "to", "for"))
title = "When to use Python for web applications"
title_words = set(title.lower().split())
keywords = title_words.difference(common_words)
print(keywords)
I think a much simpler solution and still reasonably fast is to use sqlite and regular expressions.
Put the long list of words in an sqlite table and build a b-tree index. This gives you log(n) time exists queries. Split the smaller string with a regular expression and loop over the words running an exists query for each of them.
You can stem the words first with the porter stemmer from nltk.
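For concreteness, here is one hedged sketch of that idea (the file and table names are illustrative, and the stemming step is omitted):
import re
import sqlite3

# one-time setup: load the word list into a table; the PRIMARY KEY
# gives a b-tree index, so each lookup below is a log(n) search
conn = sqlite3.connect("words.db")
conn.execute("CREATE TABLE IF NOT EXISTS common_words (word TEXT PRIMARY KEY)")
conn.executemany("INSERT OR IGNORE INTO common_words VALUES (?)",
                 [("when",), ("to",), ("use",), ("for",)])
conn.commit()

def keywords(title):
    # split with a regular expression, then keep only the words
    # that have no match in the indexed table
    words = re.findall(r"[a-z']+", title.lower())
    return [w for w in words
            if conn.execute("SELECT 1 FROM common_words WHERE word = ?",
                            (w,)).fetchone() is None]

print(keywords("When to use Python for web applications"))
# ['python', 'web', 'applications']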
While I hate to discourage the use of something cool like a trie, have you thought about doing it in straight Python? I wrote a simple benchmark using the corncob wordlist, and performance wasn't that bad.
import time

with open('corncob_lowercase.txt') as f:
    filetime = 0
    starttime = time.time()
    words = f.read().split('\n')
    endtime = time.time()
    filetime = endtime - starttime
    print "file opened in {0} seconds".format(filetime)
    nonwords = ['234' + word for word in words]

totaltime = 0
for word in nonwords:
    starttime = time.time()
    word in words
    endtime = time.time()
    totaltime += endtime - starttime

wordcount = len(words)
avgtime = totaltime / wordcount
print "average time for word: {0}".format(avgtime)
print "with {0} words".format(wordcount)

runningtimes = (filetime + i * avgtime for i in xrange(10))
print "running times: {0}".format(list(runningtimes))
Note that I'm testing the worst case, where the word isn't in the file. I'm also including the time to load and process the file; if you were to memcache it, that would disappear. One further thing to note is that my machine is basically crap. C is fast, but most of the code involved in searching a list is written in C anyway. Finally, this test is for pretty much every word in the English language. If you just want 10,000, I think that's cake.
file opened in 0.0135519504547 seconds
average time for word: 0.00249605141253
with 58113 words
running times: [0.013551950454711914, 0.016048001867237236, 0.018544053279762558,
0.021040104692287877, 0.023536156104813199, 0.026032207517338521, 0.028528258929863839,
0.031024310342389162, 0.033520361754914484, 0.036016413167439809]
I don't know which method is used by SO, but:
I suppose a fast (and very simplistic) way of doing this is by going back to C and checking the words one by one, maybe with the KMP algorithm.
Another (not so simple) way of doing this is by keeping a trie with those 10,000 words and searching the text using that. This would be super-fast but fairly hard to implement. If you are interested, I have a dummy implementation in C++.
EDIT
Looking back at it, I see I've only used fstream, so this could be modified easily for C, and you'll be able to integrate it with Python easily. This is the source:
#include <fstream>
#include <cstring>
using namespace std;

ifstream in("trie.in");
ofstream out("trie.out");

struct Trie
{
    short nr, pref;
    Trie *children[26], *father;
    Trie()
    {
        int i;
        nr = pref = 0;
        for (i = 0; i < 26; i++)
            children[i] = NULL;
        father = NULL;
    }
};

Trie t, *it, *it2;
int n, op, val, i, l, len;
char s[22], *p;

int main()
{
    while (in >> op >> s)
    {
        p = s;
        it = &t;
        l = 0; len = 0;
        while (p[0] != '\0')
        {
            if (it->children[p[0] - 'a'] == NULL && op == 2)
                { op = 9; out << "0\n"; break; }
            if (it->children[p[0] - 'a'] == NULL && op == 3)
                break;
            if (it->children[p[0] - 'a'] == NULL)
                it->children[p[0] - 'a'] = new Trie(),
                it->children[p[0] - 'a']->father = it,
                it = it->children[p[0] - 'a'];
            else
                it = it->children[p[0] - 'a'];
            if (op == 0)
                ++it->pref;
            else if (op == 1 && it->pref > 0)
                --it->pref;
            else if (op == 3 && it->pref > 0)
                l = p - s + 1;
            p++;
        }
        if (op == 0)
            it->nr++;
        else if (op == 1 && it->nr > 0)
        {
            it->nr--;
            l = strlen(s) - 1;
            while (it->pref == 0 && it != &t && l >= 0)
            {
                it2 = it->father;
                it2->children[s[l--] - 'a'] = NULL;
                delete it;
                it = it2;
            }
        }
        else if (op == 2)
            out << it->nr << '\n';
        else if (op == 3)
            out << l << '\n';
    }
    return 0;
}
This takes in trie.in text formatted like this:
0 lat
0 mare
0 lac
2 la
0 mare
1 lat
0 ma
0 lung
3 latitudine
0 mari
2 mare
0 lat
0 mic
3 latime
2 lac
3 mire
And produces text like this
0
2
2
3
1
2
0 w - add the word w in the list (could be multiple times)
1 w - delete one record of the word w from the list (could be multiple times)
2 w - print how many w words are there in the list
3 w - print the length of the longest common prefix of w with any other word in the list
Oh, and sorry for the poor formatting, this was done for training.
If some false positives/negatives are OK, search for "Bloom filter" on Wikipedia.
If not, look at CDB (yum install tinycdb on Fedora; no Python API at the moment).
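For illustration, a toy version of the Bloom-filter idea in Python (everything here, from the class name to the sizing, is made up for the sketch; a real deployment would derive the size and hash count from the target false-positive rate):
import hashlib

class BloomFilter:
    def __init__(self, size=100000, hashes=5):
        self.size, self.hashes = size, hashes
        self.bits = bytearray(size)  # one byte per bit position, for simplicity

    def _positions(self, word):
        # derive k hash functions from blake2b with k different salts
        for salt in range(self.hashes):
            digest = hashlib.blake2b(word.encode(), salt=bytes([salt])).digest()
            yield int.from_bytes(digest[:8], "big") % self.size

    def add(self, word):
        for pos in self._positions(word):
            self.bits[pos] = 1

    def __contains__(self, word):
        # may report false positives, never false negatives
        return all(self.bits[pos] for pos in self._positions(word))

common = BloomFilter()
for w in ("the", "when", "use", "for"):
    common.add(w)
print("the" in common, "python" in common)  # True False (barring a false positive)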
How about using Python's very nice filter built-in:
common_words = set(("if", "but", "and", "the", "when", "use", "to", "for"))
title = "When to use Python for web applications"
print filter(lambda word: word not in common_words, title.lower().split())
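Note this is Python 2, where filter() returns a list. In Python 3, filter() returns a lazy iterator and print is a function, so the equivalent would be:
common_words = {"if", "but", "and", "the", "when", "use", "to", "for"}
title = "When to use Python for web applications"
print([word for word in title.lower().split() if word not in common_words])
# ['python', 'web', 'applications']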
