diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..6691aea --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module code.bcarlin.xyz/go/rx + +go 1.17 + +require github.com/stretchr/testify v1.7.0 + +require ( + github.com/davecgh/go-spew v1.1.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..acb88a4 --- /dev/null +++ b/go.sum @@ -0,0 +1,11 @@ +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/rx.go b/rx.go new file mode 100644 index 0000000..fb7fe8c --- /dev/null +++ b/rx.go @@ -0,0 +1,217 @@ +// Package rx provides an expressive way to build regular expressions. +package rx + +import ( + "fmt" + "strings" +) + +// Regex represents a regular expression. +type Regex string + +const ( + // Any matches any character ("."). + Any = "." +) + +// +// CHARACTER CLASSES +// + +// In generates a character class composed by the concatenation of all arguments. +func (r Regex) In(rxs ...Regex) Regex { + r += "[" + + for _, rx := range rxs { + r += rx + } + + r += "]" + + return r +} + +// NotIn generates a negated character class composed by the concatenation of all arguments. +func (r Regex) NotIn(rxs ...Regex) Regex { + r += "[^" + + for _, rx := range rxs { + r += rx + } + + r += "]" + + return r +} + +// +// COMPOSITES +// + +// Then appends the given regexes to the current one. +func (r Regex) Then(rxs ...Regex) Regex { + for _, rx := range rxs { + r += rx + } + + return r +} + +// AnyOf appends an alternative to the regex. The resulting alternative +// matches any of the given regexes. +func (r Regex) AnyOf(rxs ...Regex) Regex { + r += "(?:" + + for i, rx := range rxs { + if i != 0 { + r += "|" + } + + r += rx + } + + r += ")" + + return r +} + +// +// QUANTIFIERS +// + +// ZeroOrMore appends the given regex with a "zero or more" quantifier ("*"), prefer more. +func (r Regex) ZeroOrMore(rx Regex) Regex { + return r + protectMultiChar(rx) + "*" +} + +// OneOrMore appends the given regex with a "one or more" quantifier ("+"), prefer more. +func (r Regex) OneOrMore(rx Regex) Regex { + return r + protectMultiChar(rx) + "+" +} + +// ZeroOrOne appends the given regex with a "one or more" quantifier ("?"), prefer more. +func (r Regex) ZeroOrOne(rx Regex) Regex { + return r + protectMultiChar(rx) + "?" +} + +// NTimes appends the given regex with an exact number of repeats ("{N}"). +func (r Regex) NTimes(n int, rx Regex) Regex { + return Regex(fmt.Sprintf("%s%s{%d}", r, protectMultiChar(rx), n)) +} + +// NOrMore appends the given regex with an minimum number of repeats ("{N,}"), prefer more. +func (r Regex) NOrMore(n int, rx Regex) Regex { + return Regex(fmt.Sprintf("%s%s{%d,}", r, protectMultiChar(rx), n)) +} + +// NUpToM appends the given regex with an minimum and maximum number of repeats +// ("{N,M}"), prefer more. +func (r Regex) NUpToM(n, m int, rx Regex) Regex { + return Regex(fmt.Sprintf("%s%s{%d,%d}", r, protectMultiChar(rx), n, m)) +} + +// ZeroOrMoreLazy appends the given regex with a "zero or more" quantifier ("*"), prefer more. +func (r Regex) ZeroOrMoreLazy(rx Regex) Regex { + return r.ZeroOrMore(rx) + "?" +} + +// OneOrMoreLazy appends the given regex with a "one or more" quantifier ("+"), prefer more. +func (r Regex) OneOrMoreLazy(rx Regex) Regex { + return r.OneOrMore(rx) + "?" +} + +// ZeroOrOneLazy appends the given regex with a "one or more" quantifier ("?"), prefer more. +func (r Regex) ZeroOrOneLazy(rx Regex) Regex { + return r.ZeroOrOne(rx) + "?" +} + +// NTimesLazy appends the given regex with an exact number of repeats ("{N}"). +func (r Regex) NTimesLazy(n int, rx Regex) Regex { + return r.NTimes(n, rx) + "?" +} + +// NOrMoreLazy appends the given regex with an minimum number of repeats ("{N,}"), prefer more. +func (r Regex) NOrMoreLazy(n int, rx Regex) Regex { + return r.NOrMore(n, rx) + "?" +} + +// NUpToMLazy appends the given regex with an minimum and maximum number of repeats +// ("{N,M}"), prefer more. +func (r Regex) NUpToMLazy(n, m int, rx Regex) Regex { + return r.NUpToM(n, m, rx) + "?" +} + +// +// UTILS +// + +//nolint:gomnd // do not want to create constant for every length +func protectMultiChar(rx Regex) Regex { + rxs := string(rx) + + // a, ., ... + isOneChar := len(rx) == 1 + + // \d, \b, ... + isEscape := len(rx) == 2 && rx[0] == '\\' + + // Octals \123 + isOctal := (2 <= len(rx) && len(rx) <= 4 && rx[0] == '\\' && areDigits(rxs[1:])) + + // literals \Q...\E + isLiteral := (len(rx) >= 4 && rxs[0:2] == "\\Q" && rxs[len(rx)-2:] == "\\E") + + if isOneChar || isEscape || isCharClass(rx) || isUnicodeClass(rx) || + isOctal || isHexa(rx) || isLiteral { + return rx + } + + return "(?:" + rx + ")" +} + +// checks if the string is a single hexa nomber: \x2F or \x{10FFFF}. +func isHexa(rx Regex) bool { + s := string(rx) + + return (len(s) == 4 && s[0:2] == "\\x" && areHexaDigits(s[2:])) || + (len(s) >= 5 && len(s) <= 10 && s[0] == '\\' && s[1] == 'x' && + s[2] == '{' && s[len(s)-1] == '}' && areHexaDigits(s[3:len(rx)-1])) +} + +// Checks if the regex is a single unicode character class: \pF and \PF, or +// \p{Greek} and \P{Greek}, but not \p{Greek}\p{Digit}. +func isUnicodeClass(rx Regex) bool { + s := string(rx) + + return (len(s) == 3 && (s[0:2] == "\\p" || s[0:2] == "\\P")) || + (len(s) >= 3 && (s[0:3] == "\\p{" || s[0:3] == "\\P{") && s[len(s)-1] == '}' && + !strings.ContainsRune(s[1:len(s)-2], '}')) +} + +// Checks if the regex is a single character class: [abc] but not [abc][def]. +func isCharClass(rx Regex) bool { + s := string(rx) + + return (len(s) >= 3 && s[0] == '[' && s[len(s)-1] == ']' && + !strings.ContainsRune(s[1:len(s)-2], ']')) +} + +func areDigits(s string) bool { + for _, c := range s { + if c <= '0' || '9' <= c { + return false + } + } + + return true +} + +func areHexaDigits(s string) bool { + for _, c := range s { + if !('0' <= c && c <= '9' || 'A' <= c && c <= 'F') { + return false + } + } + + return true +} diff --git a/rx_test.go b/rx_test.go new file mode 100644 index 0000000..b276ee0 --- /dev/null +++ b/rx_test.go @@ -0,0 +1,165 @@ +package rx + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestRegex(t *testing.T) { + a := Regex("a") + + t.Run("ZeroOrMore() method", func(t *testing.T) { + tcs := []struct { + rx Regex + expected string + }{ + {"a", "aa*"}, + {"ab", "a(?:ab)*"}, + {"[xyz]", "a[xyz]*"}, + {"[xyz][abc]", "a(?:[xyz][abc])*"}, + {"[^xyz]", "a[^xyz]*"}, + {`\d`, `a\d*`}, + {"[[:alpha:]]", "a[[:alpha:]]*"}, + {"[[:^alpha:]]", "a[[:^alpha:]]*"}, + {`\pN`, `a\pN*`}, + {`\p{Greek}`, `a\p{Greek}*`}, + {`\p{Greek}\p{Digit}`, `a(?:\p{Greek}\p{Digit})*`}, + {`\PN`, `a\PN*`}, + {`\P{Greek}`, `a\P{Greek}*`}, + {`\P{Greek}\P{Digit}`, `a(?:\P{Greek}\P{Digit})*`}, + {`\123`, `a\123*`}, + {`\12`, `a\12*`}, + {`\1`, `a\1*`}, + {`\1a3`, `a(?:\1a3)*`}, + {`\x7F`, `a\x7F*`}, + {`\x{10FFFF}`, `a\x{10FFFF}*`}, + {`\x{0000}`, `a\x{0000}*`}, + {`\x{1}`, `a\x{1}*`}, + {`\x{Z}`, `a(?:\x{Z})*`}, + {`\Q...\E`, `a\Q...\E*`}, + {`\Q\E`, `a\Q\E*`}, + } + + for _, tc := range tcs { + require.Equal(t, tc.expected, string(a.ZeroOrMore(tc.rx))) + } + }) +} + +func ExampleRegex_In() { + fmt.Println(Regex("a").In("0-9", "a-z")) + // Output: a[0-9a-z] +} + +func ExampleRegex_NotIn() { + fmt.Println(Regex("a").NotIn("0-9", "a-z")) + // Output: a[^0-9a-z] +} + +func ExampleRegex_Then() { + fmt.Println(Regex("a").Then("b", Any)) + // Output: ab. +} + +func ExampleRegex_AnyOf() { + fmt.Println(Regex("a").AnyOf("b", "c")) + // Output: a(?:b|c) +} + +func ExampleRegex_ZeroOrMore() { + fmt.Println(Regex("a").ZeroOrMore("b")) + fmt.Println(Regex("a").ZeroOrMore("abc")) + // Output: + // ab* + // a(?:abc)* +} + +func ExampleRegex_OneOrMore() { + fmt.Println(Regex("a").OneOrMore("b")) + fmt.Println(Regex("a").OneOrMore("abc")) + // Output: + // ab+ + // a(?:abc)+ +} + +func ExampleRegex_ZeroOrOne() { + fmt.Println(Regex("a").ZeroOrOne("b")) + fmt.Println(Regex("a").ZeroOrOne("abc")) + // Output: + // ab? + // a(?:abc)? +} + +func ExampleRegex_NTimes() { + fmt.Println(Regex("a").NTimes(3, "b")) + fmt.Println(Regex("a").NTimes(3, "abc")) + // Output: + // ab{3} + // a(?:abc){3} +} + +func ExampleRegex_NOrMore() { + fmt.Println(Regex("a").NOrMore(3, "b")) + fmt.Println(Regex("a").NOrMore(3, "abc")) + // Output: + // ab{3,} + // a(?:abc){3,} +} + +func ExampleRegex_NUpToM() { + fmt.Println(Regex("a").NUpToM(3, 5, "b")) + fmt.Println(Regex("a").NUpToM(3, 5, "abc")) + // Output: + // ab{3,5} + // a(?:abc){3,5} +} + +func ExampleRegex_ZeroOrMoreLazy() { + fmt.Println(Regex("a").ZeroOrMoreLazy("b")) + fmt.Println(Regex("a").ZeroOrMoreLazy("abc")) + // Output: + // ab*? + // a(?:abc)*? +} + +func ExampleRegex_OneOrMoreLazy() { + fmt.Println(Regex("a").OneOrMoreLazy("b")) + fmt.Println(Regex("a").OneOrMoreLazy("abc")) + // Output: + // ab+? + // a(?:abc)+? +} + +func ExampleRegex_ZeroOrOneLazy() { + fmt.Println(Regex("a").ZeroOrOneLazy("b")) + fmt.Println(Regex("a").ZeroOrOneLazy("abc")) + // Output: + // ab?? + // a(?:abc)?? +} + +func ExampleRegex_NTimesLazy() { + fmt.Println(Regex("a").NTimesLazy(3, "b")) + fmt.Println(Regex("a").NTimesLazy(3, "abc")) + // Output: + // ab{3}? + // a(?:abc){3}? +} + +func ExampleRegex_NOrMoreLazy() { + fmt.Println(Regex("a").NOrMoreLazy(3, "b")) + fmt.Println(Regex("a").NOrMoreLazy(3, "abc")) + // Output: + // ab{3,}? + // a(?:abc){3,}? +} + +func ExampleRegex_NUpToMLazy() { + fmt.Println(Regex("a").NUpToMLazy(3, 5, "b")) + fmt.Println(Regex("a").NUpToMLazy(3, 5, "abc")) + // Output: + // ab{3,5}? + // a(?:abc){3,5}? +}