rx/rx.go

218 lines
5 KiB
Go
Raw Normal View History

2022-02-26 15:22:46 +01:00
// Package rx provides an expressive way to build regular expressions.
package rx
import (
"fmt"
"strings"
)
// Regex represents a regular expression. It is a plain string holding the
// pattern text; the builder methods declared on it take the receiver by value
// and return a new Regex with more pattern text appended.
type Regex string
const (
// Any is the pattern "." (any character). It is declared as an untyped
// string constant, so it can be passed wherever a Regex is expected.
Any = "."
)
//
// CHARACTER CLASSES
//
// In appends a character class ("[...]") to the regex. The body of the class
// is the concatenation of all arguments, copied verbatim: nothing is escaped
// and no separator is inserted between them.
func (r Regex) In(rxs ...Regex) Regex {
	class := Regex("[")
	for i := range rxs {
		class += rxs[i]
	}
	return r + class + "]"
}
// NotIn appends a negated character class ("[^...]") to the regex. The body of
// the class is the concatenation of all arguments, copied verbatim: nothing is
// escaped and no separator is inserted between them.
func (r Regex) NotIn(rxs ...Regex) Regex {
	class := Regex("[^")
	for i := range rxs {
		class += rxs[i]
	}
	return r + class + "]"
}
//
// COMPOSITES
//
// Then appends the given regexes, in order and unmodified, to the current one.
// With no arguments the receiver is returned unchanged.
func (r Regex) Then(rxs ...Regex) Regex {
	out := r
	for i := 0; i < len(rxs); i++ {
		out += rxs[i]
	}
	return out
}
// AnyOf appends an alternation to the regex. The alternatives are joined with
// "|" and enclosed in a non-capturing group ("(?:...)"), so the result matches
// any one of the given regexes.
func (r Regex) AnyOf(rxs ...Regex) Regex {
	var alts Regex
	for idx, alt := range rxs {
		if idx > 0 {
			alts += "|"
		}
		alts += alt
	}
	return r + "(?:" + alts + ")"
}
//
// QUANTIFIERS
//
// ZeroOrMore appends rx followed by the "zero or more" quantifier ("*"),
// prefer more. rx is first passed through protectMultiChar, which wraps it in
// a non-capturing group when it does not look like a single atom.
func (r Regex) ZeroOrMore(rx Regex) Regex {
	atom := protectMultiChar(rx)
	return r + atom + "*"
}
// OneOrMore appends rx followed by the "one or more" quantifier ("+"), prefer
// more. rx is first passed through protectMultiChar, which wraps it in a
// non-capturing group when it does not look like a single atom.
func (r Regex) OneOrMore(rx Regex) Regex {
	atom := protectMultiChar(rx)
	return r + atom + "+"
}
// ZeroOrOne appends rx followed by the "zero or one" quantifier ("?"), prefer
// one. rx is first passed through protectMultiChar, which wraps it in a
// non-capturing group when it does not look like a single atom.
func (r Regex) ZeroOrOne(rx Regex) Regex {
	atom := protectMultiChar(rx)
	return r + atom + "?"
}
// NTimes appends rx followed by an exact repeat count ("{N}"). rx is first
// passed through protectMultiChar, which wraps it in a non-capturing group
// when it does not look like a single atom. n is formatted as-is; it is not
// validated.
func (r Regex) NTimes(n int, rx Regex) Regex {
	count := fmt.Sprintf("{%d}", n)
	return r + protectMultiChar(rx) + Regex(count)
}
// NOrMore appends rx followed by a minimum repeat count ("{N,}"), prefer more.
// rx is first passed through protectMultiChar, which wraps it in a
// non-capturing group when it does not look like a single atom. n is formatted
// as-is; it is not validated.
func (r Regex) NOrMore(n int, rx Regex) Regex {
	count := fmt.Sprintf("{%d,}", n)
	return r + protectMultiChar(rx) + Regex(count)
}
// NUpToM appends rx followed by a minimum and maximum repeat count ("{N,M}"),
// prefer more. rx is first passed through protectMultiChar, which wraps it in
// a non-capturing group when it does not look like a single atom. n and m are
// formatted as-is; they are not validated.
func (r Regex) NUpToM(n, m int, rx Regex) Regex {
	bounds := fmt.Sprintf("{%d,%d}", n, m)
	return r + protectMultiChar(rx) + Regex(bounds)
}
// ZeroOrMoreLazy appends rx followed by the lazy "zero or more" quantifier
// ("*?"), prefer fewer. It is ZeroOrMore with a trailing "?".
func (r Regex) ZeroOrMoreLazy(rx Regex) Regex {
	greedy := r.ZeroOrMore(rx)
	return greedy + "?"
}
// OneOrMoreLazy appends rx followed by the lazy "one or more" quantifier
// ("+?"), prefer fewer. It is OneOrMore with a trailing "?".
func (r Regex) OneOrMoreLazy(rx Regex) Regex {
	greedy := r.OneOrMore(rx)
	return greedy + "?"
}
// ZeroOrOneLazy appends rx followed by the lazy "zero or one" quantifier
// ("??"), prefer zero. It is ZeroOrOne with a trailing "?".
func (r Regex) ZeroOrOneLazy(rx Regex) Regex {
	greedy := r.ZeroOrOne(rx)
	return greedy + "?"
}
// NTimesLazy appends rx followed by an exact repeat count and the lazy marker
// ("{N}?"). It is NTimes with a trailing "?".
func (r Regex) NTimesLazy(n int, rx Regex) Regex {
	exact := r.NTimes(n, rx)
	return exact + "?"
}
// NOrMoreLazy appends rx followed by a lazy minimum repeat count ("{N,}?"),
// prefer fewer. It is NOrMore with a trailing "?".
func (r Regex) NOrMoreLazy(n int, rx Regex) Regex {
	greedy := r.NOrMore(n, rx)
	return greedy + "?"
}
// NUpToMLazy appends rx followed by a lazy minimum and maximum repeat count
// ("{N,M}?"), prefer fewer. It is NUpToM with a trailing "?".
func (r Regex) NUpToMLazy(n, m int, rx Regex) Regex {
	greedy := r.NUpToM(n, m, rx)
	return greedy + "?"
}
//
// UTILS
//
// protectMultiChar returns rx in a form that a quantifier appended right after
// it applies to as a whole. rx is returned unchanged when it is recognised as
// a single atom (one byte, a two-byte escape, an octal or hexadecimal escape,
// a single character class, a single Unicode class, or a one-byte \Q...\E
// literal); anything else is wrapped in a non-capturing group "(?:...)".
//
// Wrapping is always safe, so the recognisers err on the side of wrapping.
//
//nolint:gomnd // do not want to create constant for every length
func protectMultiChar(rx Regex) Regex {
	s := string(rx)
	n := len(s)

	// a, ., ...
	isOneChar := n == 1
	// \d, \b, ...
	isEscape := n == 2 && s[0] == '\\'
	// Octals \123
	isOctal := 2 <= n && n <= 4 && s[0] == '\\' && areDigits(s[1:])
	// \Qx\E quoting exactly one byte. Longer quoted text is NOT an atom: the
	// regexp parser turns \Q...\E into a run of individual literals, so a
	// quantifier written after \Qab\E would bind to "b" only.
	isOneCharLiteral := n == 5 && s[:2] == "\\Q" && s[3:] == "\\E"

	if isOneChar || isEscape || isCharClass(rx) || isUnicodeClass(rx) ||
		isOctal || isHexa(rx) || isOneCharLiteral {
		return rx
	}
	return "(?:" + rx + ")"
}
// isHexa reports whether rx is a single hexadecimal escape: either the
// two-digit form \x2F or the braced form \x{10FFFF} (one to six digits between
// the braces, as far as the length bounds allow).
func isHexa(rx Regex) bool {
	s := string(rx)
	if len(s) < 4 || s[0] != '\\' || s[1] != 'x' {
		return false
	}
	// \xHH: exactly two characters after the prefix.
	if len(s) == 4 {
		return areHexaDigits(s[2:])
	}
	// \x{...}: at most ten bytes overall.
	if len(s) > 10 {
		return false
	}
	closing := len(s) - 1
	return s[2] == '{' && s[closing] == '}' && areHexaDigits(s[3:closing])
}
// isUnicodeClass reports whether rx is a single Unicode character class:
// the one-letter forms \pF and \PF, or the braced forms \p{Greek} and
// \P{Greek}. A sequence such as \p{Greek}\p{Digit}, or a class followed by
// anything else (including a stray "}"), is not a single class.
func isUnicodeClass(rx Regex) bool {
	s := string(rx)
	if len(s) < 3 || (s[:2] != "\\p" && s[:2] != "\\P") {
		return false
	}
	// One-letter form: any three-byte string starting with \p or \P.
	if len(s) == 3 {
		return true
	}
	// Braced form: the first '}' must also be the very last byte, otherwise
	// something follows the class.
	return s[2] == '{' && strings.IndexByte(s, '}') == len(s)-1
}
// isCharClass reports whether rx is exactly one bracketed character class:
// [abc] but not [abc][def] and not [a]].
//
// The class body is scanned for its terminating ']', which must be the last
// byte of rx. While scanning, a '^' right after the opening bracket is
// skipped, a ']' in first position is taken as a literal member, a backslash
// skips the byte that follows it, and a "[:name:]" group is skipped as a unit.
func isCharClass(rx Regex) bool {
	s := string(rx)
	last := len(s) - 1
	if len(s) < 3 || s[0] != '[' || s[last] != ']' {
		return false
	}

	i := 1
	if s[i] == '^' {
		i++
	}
	// A ']' placed first in the class is a literal member, not the terminator.
	if i < last && s[i] == ']' {
		i++
	}

	for i <= last {
		switch {
		case s[i] == '\\':
			// Escaped byte: never terminates the class.
			i += 2
		case s[i] == '[' && i < last && s[i+1] == ':':
			// Named group such as [:alpha:]; jump past its closing ":]".
			end := strings.Index(s[i:], ":]")
			if end < 0 {
				i++
			} else {
				i += end + 2
			}
		case s[i] == ']':
			// First terminator found: single class only if nothing follows.
			return i == last
		default:
			i++
		}
	}
	return false
}
// areDigits reports whether every rune of s is an ASCII decimal digit, '0'
// through '9' inclusive. It returns true for the empty string.
func areDigits(s string) bool {
	for _, c := range s {
		if c < '0' || c > '9' {
			return false
		}
	}
	return true
}
// areHexaDigits reports whether every rune of s is an ASCII hexadecimal digit:
// '0'-'9', 'A'-'F' or 'a'-'f'. It returns true for the empty string.
func areHexaDigits(s string) bool {
	for _, c := range s {
		switch {
		case '0' <= c && c <= '9':
		case 'A' <= c && c <= 'F':
		case 'a' <= c && c <= 'f':
		default:
			return false
		}
	}
	return true
}