Javascript - string.split(regex) keep separators

I would like to split a string using a regular expression and have the separators / matched info included in the resulting array.

In java I used:

theString.split("(?<=[!><=}{])|(?=[!><=}{])|(?<= AND )|(?= AND )|(?<= OR )|(?= OR )")

But, javascript doesn't support lookbehind ?<=

For example I want string:

"Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes"

To split:

Reason,=,{,Existing problem, or ,fault,},{,Bestaande probleem of vout,},{,Other,},{,Ander,}, and ,Required,!,=,No, and ,Results,>,=,10, and ,Results,<,=,25, and ,Tst,>,5, and ,Tst,<,80, and ,Info,=,test this, or ,that, and ,those, and ,Success,!,=,Yes

Example of what I've got:

// Demo script: runs three experiments on one sample filter expression and
// writes the resulting array of each experiment to the page.
var thestr = "Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes";

// Experiment 1: split() on the separator pattern.
document.write("::SPLIT::<br>");
// Global, case-insensitive pattern with three capturing alternatives: a single
// operator/brace character, " AND ", or " OR ". The space after "( OR )" is
// part of the pattern, so the third alternative only matches " OR " when it is
// followed by one more space.
var patt1=new RegExp(/([!><=}{])|( AND )|( OR ) /gi);

var x = thestr.split(patt1);
// Author's observation: this splits correctly but does not include the
// separators / matched characters.
// NOTE(review): whether captured groups show up in split() output reportedly
// differs between browsers -- verify in the target engine.
document.write("length="+x.length+"<br>");
// The counter c is not declared with var anywhere in this snippet.
for (c=0;c<x.length;c++) {
    document.write(c+" - "+ x[c]+" |");
}

// Experiment 2: match() with the same pattern.
document.write("<br><br>::MATCH::<br>");

// Because patt1 has the g flag, match() returns the matched substrings (the
// separators) instead of the text between them.
var y = thestr.match(patt1);

// Author's observation: this shows the matched characters, but how can the
// information from split and match be combined?
document.write("length="+y.length+"<br>");
// The counter d is not declared with var anywhere in this snippet.
for (d=0;d<y.length;d++) {
    document.write(d+" - "+ y[d]+" |");
}

// Experiment 3: split() on lookahead assertions only.
document.write("<br><br>::INCLUDE SEPERATORS::<br>");
// (?!...) is a negative lookahead and (?=...) a positive lookahead; neither is
// a lookbehind. All alternatives except the last are zero-width; the last one
// again carries a literal trailing space after "(?= OR )".
var patt2=new RegExp(/(?![!><=}{])|(?=[!><=}{])|(?! AND )|(?= AND )|(?! OR )|(?= OR ) /gi);
// Author's observation: this puts everything in the array, but each character
// is a separate array element.
// Not what I wanted to achieve.
var bits = thestr.split(patt2);
document.write("length="+bits.length+"<br>");
// The counter r is not declared with var anywhere in this snippet.
for (r=0;r<bits.length;r++) {
    document.write(r+" - "+ bits[r]+" |");
}

Answers:

Answer

If you put the whole pattern in a group, you will also get the separators:

thestr.split(/([!><=}{]| (?:AND|OR) )/)

This returns an array like:

["Reason", "=", "", "{", "Existing problem or fault", "}", "", "{", "Bestaande probleem of vout", "}", "", "{", "Other", "}", "", "{", "Ander", "}", " and Required", "!", "", "=", "No and Results ", ">", "", "=", "10 and Results ", "<", "", "=", "25 and Tst", ">", "5 and Tst", "<", "80 and Info", "=", "test this or that and those and Success", "!", "", "=", "Yes"]

Then you just need to filter the empty strings and you’re done:

thestr.split(/([!><=}{]| (?:AND|OR) )/).filter(Boolean)

Edit: Since Internet Explorer and possibly other browsers do not include a captured separator in the result array, you could do this instead:

// Portable variant that does not rely on split() returning captured groups:
// split on a non-capturing pattern, collect the separators with a global match
// of the same pattern, then interleave the two lists into `matches`.
var parts = thestr.split(/(?:[!><=}{]| (?:AND|OR) )/),
    // match() returns null when nothing matches; fall back to an empty list.
    separators = thestr.match(/(?:[!><=}{]| (?:AND|OR) )/g) || [],
    // The combined result starts with the text before the first separator.
    matches = [parts[0]];
for (var i=0; i<separators.length; ++i) {
    // Every separator is followed by the piece of text that came after it.
    // Empty strings appear where two separators are adjacent; drop them with
    // filter(Boolean) afterwards if they are not wanted.
    matches.push(separators[i], parts[i+1]);
}

This basically separates the separators from the other parts and then combines both.

Answer

Without getting too deep into your query structure, I would suggest using the replace method with a function as the replacement, which collects the terms into an array:

/**
 * Template for collecting the terms of a query string into an array by
 * running String.prototype.replace with a callback.
 *
 * The regular expression below is a placeholder: replace it with a pattern
 * whose capturing groups match the terms of the query, and extend the
 * callback's parameter list to one parameter per group.
 *
 * @param {string} sQuery - The query text to scan.
 * @returns {Array} The truthy callback arguments collected in match order.
 */
function parse(sQuery) {
    var aParsed = [];
    // Placeholder pattern -- define the real term pattern here.
    var oReTerms = /.../gim;
    // replace() is used only for its per-match callback; its return value
    // (the rebuilt string) is intentionally discarded.
    sQuery.replace(oReTerms, function($0, $1, $2 /*, ... */) {
        //...
        if ($1) {
            aParsed.push($1);
        }
        if ($2) {
            aParsed.push($2);
        }
        //...
        return $0; // return what was matched (or any string)
    });
    return aParsed;
}

I did this previously to parse HTML tags and attributes. I hope the idea is clear. You just need to define your regular expression so that it matches all terms in the query.

And you can have another replacing within the replacement function for specific cases.

Answer

I'm not sure about how JavaScript behaves if a regex split contains a capturing group. I know that in Python, a splitting delimiter becomes part of the match if it is enclosed in capturing parentheses.

Try

result = subject.split(/( or )|( and )|([^\w\s])\b|(?=[^\w\s])/i);

and see what happens.

Answer
/**
 * Splits `str` on every match of `re` and keeps the matched separators in the
 * result, in order of appearance.
 *
 * Empty pieces in front of a separator are omitted (for example between two
 * adjacent separators), while the remainder after the last match is always
 * appended, even when it is empty.
 *
 * @param {string} str - Text to split.
 * @param {RegExp} re - Separator pattern. A non-global pattern is scanned
 *     through a global copy; a global pattern has its lastIndex reset.
 * @returns {string[]} Alternating text pieces and separators.
 */
function split2(str, re) {
    var scanner = re;
    if (!re.global) {
        // exec() on a non-global regex restarts at index 0 on every call and
        // would never terminate the loop below, so scan with a global copy.
        var flags = typeof re.flags === 'string'
            ? re.flags
            : (re.ignoreCase ? 'i' : '') + (re.multiline ? 'm' : '');
        scanner = new RegExp(re.source, flags + 'g');
    }
    // Reset to start of string
    scanner.lastIndex = 0;
    var result = [];
    var lastEnd = 0;
    var match;
    while ((match = scanner.exec(str)) !== null) {
        if (match.index > lastEnd) {
            result.push(str.substring(lastEnd, match.index));
        }
        if (match[0].length === 0) {
            // A zero-width match does not advance lastIndex; step over it
            // manually so the scan cannot stall on the same position.
            scanner.lastIndex++;
        } else {
            result.push(match[0]);
        }
        lastEnd = match.index + match[0].length;
    }
    result.push(str.substring(lastEnd));
    return result;
}

var thestr = "Reason={Existing problem or fault}{Bestaande probleem of vout}{Other}{Ander} and Required!=No and Results >=10 and Results <=25 and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes";

var patt = /[!><=}{]| AND | OR /gi;

split2(thestr,patt):

Output:

["Reason", "=", "{", "Existing problem", " or ", "fault", "}", "{",
"Bestaande probleem of vout", "}", "{", "Other", "}", "{", "Ander", "}", " and ",
"Required", "!", "=", "No", " and ", "Results ", ">", "=", "10", " and ",
"Results ", "<", "=", "25", " and ", "Tst", ">", "5", " and ", "Tst", "<", "80",
" and ", "Info", "=", "test this", " or ", "that", " and ", "those", " and ",
"Success", "!", "=", "Yes"]
Answer

Gumbo's split function above is a good idea but it doesn't work. It should be:

/**
 * Splits `str` on `regex` and returns the pieces interleaved with the
 * separators that were matched between them: even indexes hold text (possibly
 * empty), odd indexes hold separators.
 *
 * The pattern is always scanned through a global copy, so the result is the
 * same whether or not `regex` carries the g flag, capturing groups inside the
 * pattern do not leak into the output, and the caller's regex is not touched.
 *
 * @param {string} str - Text to split.
 * @param {RegExp} regex - Separator pattern.
 * @returns {string[]} Text pieces and separators in original order.
 */
function split(str, regex) {
    var flags = typeof regex.flags === 'string'
        ? regex.flags
        : (regex.ignoreCase ? 'i' : '') + (regex.multiline ? 'm' : '');
    if (flags.indexOf('g') === -1) {
        flags += 'g';
    }
    var scanner = new RegExp(regex.source, flags),
        ret     = [],
        lastEnd = 0,
        match;
    while ((match = scanner.exec(str)) !== null) {
        if (match[0].length === 0) {
            // A zero-width match does not advance lastIndex; step over it.
            scanner.lastIndex++;
            // Like String.prototype.split, ignore empty matches that sit
            // directly on the previous split point or at the end of input.
            if (match.index === lastEnd || match.index === str.length) {
                continue;
            }
        }
        // match[0] is the whole separator, regardless of capturing groups.
        ret.push(str.substring(lastEnd, match.index), match[0]);
        lastEnd = match.index + match[0].length;
    }
    ret.push(str.substring(lastEnd));
    return ret;
}

split('a,b,c', /,/g); // returns ["a", ",", "b", ",", "c"]
Answer

To support most of the browsers in use, you can match the tokens instead of splitting the string.

This pattern matches either a run of characters that contains none of the separators (<>!{}=) or a single separator.

// Tokenizer pattern: a run of non-separator characters, or exactly one of the
// separator characters < > ! { } =
var rx = /([^<>!{}=]+|[<>!{}=])/g;

// Sample expression, assembled from shorter fragments to keep lines readable.
var str = [
    'Reason={Existing problem or fault}{Bestaande probleem of vout}',
    '{Other}{Ander} and Required!=No and Results >=10 and Results <=25 ',
    'and Tst>5 and Tst<80 and Info=test this or that and those and Success!=Yes'
].join('');

// One token per line.
str.match(rx).join('\n');

//returned value:
Reason
=
{
Existing problem or fault
}
{
Bestaande probleem of vout
}
{
Other
}
{
Ander
}
 and Required
!
=
No and Results 
>
=
10 and Results 
<
=
25 and Tst
>
5 and Tst
<
80 and Info
=
test this or that and those and Success
!
=
Yes

// I concatenated the string and joined the result for readability

Tags

Recent Questions

Top Questions

Home Tags Terms of Service Privacy Policy DMCA Contact Us Javascript

©2020 All rights reserved.