316 lines
7.3 KiB
Go
316 lines
7.3 KiB
Go
// Package rx provides an expressive way to build regular expressions.
|
|
package rx
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// Flags is a bit set that alters how a group of the regex is interpreted.
// Every regex flag comes as a pair of bits: one that sets the flag and one
// that clears it.
type Flags uint

const (
	// CaseInsensitive makes the regex case insensitive (flag "i" set).
	CaseInsensitive Flags = 1 << 0

	// CaseSensitive makes the regex case sensitive (flag "i" cleared,
	// default behavior).
	CaseSensitive Flags = 1 << 1

	// MultiLine makes StartOfText and EndOfText match the beginning and
	// the end of each line (flag "m" set).
	MultiLine Flags = 1 << 2

	// SingleLine makes StartOfText and EndOfText match the beginning and
	// the end of the whole text (flag "m" cleared, default behavior).
	SingleLine Flags = 1 << 3

	// AnyNL makes Any match new lines (flag "s" set).
	AnyNL Flags = 1 << 4

	// AnyNoNL makes Any not match new lines (flag "s" cleared, default
	// behavior).
	AnyNoNL Flags = 1 << 5

	// Ungreedy makes the quantifiers match the shortest text possible
	// (flag "U" set).
	Ungreedy Flags = 1 << 6

	// Greedy makes the quantifiers match the longest text possible (flag
	// "U" cleared, default behavior).
	Greedy Flags = 1 << 7
)
|
|
|
|
// Regex is the textual form of a regular expression. Builder methods return
// a new Regex made of the receiver followed by the piece they generate.
type Regex string

// Any matches any character (".").
const Any Regex = "."

// nonGreedyOp is the suffix appended after a quantifier by the Lazy variants
// of the quantifier methods.
const nonGreedyOp Regex = "?"
|
|
|
|
//
|
|
// CHARACTER CLASSES
|
|
//
|
|
|
|
// In generates a character class composed by the concatenation of all
|
|
// arguments.
|
|
func (r Regex) In(rxs ...Regex) Regex {
|
|
return r + "[" + joinRxs(rxs, "") + "]"
|
|
}
|
|
|
|
// NotIn generates a negated character class composed by the concatenation
|
|
// of all arguments.
|
|
func (r Regex) NotIn(rxs ...Regex) Regex {
|
|
return r + "[^" + joinRxs(rxs, "") + "]"
|
|
}
|
|
|
|
//
|
|
// COMPOSITES
|
|
//
|
|
|
|
// Then appends the given regexes to the current one.
|
|
func (r Regex) Then(rxs ...Regex) Regex {
|
|
return r + joinRxs(rxs, "")
|
|
}
|
|
|
|
// AnyOf appends an alternative to the regex. The resulting alternative
|
|
// matches any of the given regexes.
|
|
func (r Regex) AnyOf(rxs ...Regex) Regex {
|
|
return r + "(?:" + joinRxs(rxs, "|") + ")"
|
|
}
|
|
|
|
//
|
|
// QUANTIFIERS
|
|
//
|
|
|
|
// ZeroOrMore appends the given regex with a "zero or more" quantifier
|
|
// ("*"), prefer more.
|
|
func (r Regex) ZeroOrMore(rx Regex) Regex {
|
|
return r + protectMultiChar(rx) + "*"
|
|
}
|
|
|
|
// OneOrMore appends the given regex with a "one or more" quantifier ("+"),
|
|
// prefer more.
|
|
func (r Regex) OneOrMore(rx Regex) Regex {
|
|
return r + protectMultiChar(rx) + "+"
|
|
}
|
|
|
|
// ZeroOrOne appends the given regex with a "one or more" quantifier ("?"),
|
|
// prefer more.
|
|
func (r Regex) ZeroOrOne(rx Regex) Regex {
|
|
return r + protectMultiChar(rx) + "?"
|
|
}
|
|
|
|
// NTimes appends the given regex with an exact number of repeats ("{N}").
|
|
func (r Regex) NTimes(n int, rx Regex) Regex {
|
|
return Regex(fmt.Sprintf("%s%s{%d}", r, protectMultiChar(rx), n))
|
|
}
|
|
|
|
// NOrMore appends the given regex with an minimum number of repeats
|
|
// ("{N,}"), prefer more.
|
|
func (r Regex) NOrMore(n int, rx Regex) Regex {
|
|
return Regex(fmt.Sprintf("%s%s{%d,}", r, protectMultiChar(rx), n))
|
|
}
|
|
|
|
// NUpToM appends the given regex with an minimum and maximum number of repeats
|
|
// ("{N,M}"), prefer more.
|
|
func (r Regex) NUpToM(n, m int, rx Regex) Regex {
|
|
return Regex(fmt.Sprintf("%s%s{%d,%d}", r, protectMultiChar(rx), n, m))
|
|
}
|
|
|
|
// ZeroOrMoreLazy appends the given regex with a "zero or more" quantifier
|
|
// ("*"), prefer more.
|
|
func (r Regex) ZeroOrMoreLazy(rx Regex) Regex {
|
|
return r.ZeroOrMore(rx) + nonGreedyOp
|
|
}
|
|
|
|
// OneOrMoreLazy appends the given regex with a "one or more" quantifier
|
|
// ("+"), prefer more.
|
|
func (r Regex) OneOrMoreLazy(rx Regex) Regex {
|
|
return r.OneOrMore(rx) + nonGreedyOp
|
|
}
|
|
|
|
// ZeroOrOneLazy appends the given regex with a "one or more" quantifier
|
|
// ("?"), prefer more.
|
|
func (r Regex) ZeroOrOneLazy(rx Regex) Regex {
|
|
return r.ZeroOrOne(rx) + nonGreedyOp
|
|
}
|
|
|
|
// NTimesLazy appends the given regex with an exact number of repeats ("{N}").
|
|
func (r Regex) NTimesLazy(n int, rx Regex) Regex {
|
|
return r.NTimes(n, rx) + nonGreedyOp
|
|
}
|
|
|
|
// NOrMoreLazy appends the given regex with an minimum number of repeats
|
|
// ("{N,}"), prefer more.
|
|
func (r Regex) NOrMoreLazy(n int, rx Regex) Regex {
|
|
return r.NOrMore(n, rx) + nonGreedyOp
|
|
}
|
|
|
|
// NUpToMLazy appends the given regex with an minimum and maximum number of repeats
|
|
// ("{N,M}"), prefer more.
|
|
func (r Regex) NUpToMLazy(n, m int, rx Regex) Regex {
|
|
return r.NUpToM(n, m, rx) + nonGreedyOp
|
|
}
|
|
|
|
//
|
|
// GROUPS
|
|
//
|
|
|
|
// Capture appends a capture group to the regexes. The regexes given as
|
|
// arguments are just concatenated.
|
|
func (r Regex) Capture(rxs ...Regex) Regex {
|
|
return r + "(" + joinRxs(rxs, "") + ")"
|
|
}
|
|
|
|
// CaptureName appends a named capture group to the regexes. The regexes
|
|
// given as arguments are just concatenated.
|
|
func (r Regex) CaptureName(name string, rxs ...Regex) Regex {
|
|
return r + "(?P<" + Regex(name) + ">" + joinRxs(rxs, "") + ")"
|
|
}
|
|
|
|
// WithFlags appends a non-capturing group with the given flags set to the
|
|
// regex. The regexes given as arguments are just concatenated.
|
|
func (r Regex) WithFlags(flags Flags, rxs ...Regex) Regex {
|
|
tmp := joinRxs(rxs, "")
|
|
|
|
if f := buildFlagString(flags); f != "" {
|
|
tmp = "(?" + Regex(f) + ":" + tmp + ")"
|
|
}
|
|
|
|
return r + tmp
|
|
}
|
|
|
|
//
|
|
// UTILS
|
|
//
|
|
|
|
//nolint:gomnd // do not want to create constant for every length
|
|
func protectMultiChar(rx Regex) Regex {
|
|
rxs := string(rx)
|
|
|
|
// a, ., ...
|
|
isOneChar := len(rx) == 1
|
|
|
|
// \d, \b, ...
|
|
isEscape := len(rx) == 2 && rx[0] == '\\'
|
|
|
|
// Octals \123
|
|
isOctal := (2 <= len(rx) && len(rx) <= 4 && rx[0] == '\\' && areDigits(rxs[1:]))
|
|
|
|
// literals \Q...\E
|
|
isLiteral := (len(rx) >= 4 && rxs[0:2] == "\\Q" && rxs[len(rx)-2:] == "\\E")
|
|
|
|
if isOneChar || isEscape || isCharClass(rx) || isUnicodeClass(rx) ||
|
|
isOctal || isHexa(rx) || isLiteral {
|
|
return rx
|
|
}
|
|
|
|
return "(?:" + rx + ")"
|
|
}
|
|
|
|
// checks if the string is a single hexa nomber: \x2F or \x{10FFFF}.
|
|
func isHexa(rx Regex) bool {
|
|
s := string(rx)
|
|
|
|
return (len(s) == 4 && s[0:2] == "\\x" && areHexaDigits(s[2:])) ||
|
|
(len(s) >= 5 && len(s) <= 10 && s[0] == '\\' && s[1] == 'x' &&
|
|
s[2] == '{' && s[len(s)-1] == '}' && areHexaDigits(s[3:len(rx)-1]))
|
|
}
|
|
|
|
// Checks if the regex is a single unicode character class: \pF and \PF, or
|
|
// \p{Greek} and \P{Greek}, but not \p{Greek}\p{Digit}.
|
|
func isUnicodeClass(rx Regex) bool {
|
|
s := string(rx)
|
|
|
|
return (len(s) == 3 && (s[0:2] == "\\p" || s[0:2] == "\\P")) ||
|
|
(len(s) >= 3 && (s[0:3] == "\\p{" || s[0:3] == "\\P{") && s[len(s)-1] == '}' &&
|
|
!strings.ContainsRune(s[1:len(s)-2], '}'))
|
|
}
|
|
|
|
// Checks if the regex is a single character class: [abc] but not [abc][def].
|
|
func isCharClass(rx Regex) bool {
|
|
s := string(rx)
|
|
|
|
return (len(s) >= 3 && s[0] == '[' && s[len(s)-1] == ']' &&
|
|
!strings.ContainsRune(s[1:len(s)-2], ']'))
|
|
}
|
|
|
|
// areDigits reports whether every character of s is an ASCII decimal digit,
// '0' and '9' included. It returns true for the empty string.
func areDigits(s string) bool {
	for _, c := range s {
		if c < '0' || c > '9' {
			return false
		}
	}

	return true
}
|
|
|
|
// areHexaDigits reports whether every character of s is an ASCII hexadecimal
// digit: '0'-'9', 'A'-'F' or 'a'-'f'. It returns true for the empty string.
func areHexaDigits(s string) bool {
	for _, c := range s {
		switch {
		case '0' <= c && c <= '9':
		case 'A' <= c && c <= 'F':
		case 'a' <= c && c <= 'f':
		default:
			return false
		}
	}

	return true
}
|
|
|
|
func buildFlagString(flags Flags) string {
|
|
set, cleared := "", ""
|
|
|
|
if flags&CaseInsensitive != 0 {
|
|
set += "i"
|
|
}
|
|
|
|
if flags&MultiLine != 0 {
|
|
set += "m"
|
|
}
|
|
|
|
if flags&AnyNL != 0 {
|
|
set += "s"
|
|
}
|
|
|
|
if flags&Ungreedy != 0 {
|
|
set += "U"
|
|
}
|
|
|
|
if flags&CaseSensitive != 0 {
|
|
cleared += "i"
|
|
}
|
|
|
|
if flags&SingleLine != 0 {
|
|
cleared += "m"
|
|
}
|
|
|
|
if flags&AnyNoNL != 0 {
|
|
cleared += "s"
|
|
}
|
|
|
|
if flags&Greedy != 0 {
|
|
cleared += "U"
|
|
}
|
|
|
|
if cleared != "" {
|
|
cleared = "-" + cleared
|
|
}
|
|
|
|
return set + cleared
|
|
}
|
|
|
|
func joinRxs(rxs []Regex, sep string) Regex {
|
|
var rv Regex
|
|
|
|
for i := range rxs {
|
|
if i != 0 {
|
|
rv += Regex(sep)
|
|
}
|
|
|
|
rv += rxs[i]
|
|
}
|
|
|
|
return rv
|
|
}
|