// Package rx provides an expressive way to build regular expressions. package rx import ( "fmt" "strings" ) // Flags changes the meaning of the regex. type Flags uint const ( // CaseInsensitive makes the regex case insensitive (flag "i" set). CaseInsensitive Flags = 1 << iota // CaseSensitive makes the regex case sensitive (flag "i" cleared, // default behavior). CaseSensitive // MultiLine makes StartOfText and EndOfText match the beginning and // the end of each line (flag "m" set). MultiLine // SingleLine makes StartOfText and EndOfText match the beginning and // the end of the whole text (flag "m" cleared, default behavior). SingleLine // AnyNL makes Any match new lines(flag "s" set). AnyNL // AnyNoNL makes Any not match new lines (flag "s" cleared, default // behavior). AnyNoNL // Ungreedy makes the quantifiers match the shortest text possible // (flag "U" set). Ungreedy // Greedy makes the quantifiers match the longest text possible (flag // "U" cleared, default behavior). Greedy ) // Regex represents a regular expression. type Regex string const ( // Any matches any character ("."). Any Regex = "." nonGreedyOp Regex = "?" ) // // CHARACTER CLASSES // // In generates a character class composed by the concatenation of all // arguments. func (r Regex) In(rxs ...Regex) Regex { return r + "[" + joinRxs(rxs, "") + "]" } // NotIn generates a negated character class composed by the concatenation // of all arguments. func (r Regex) NotIn(rxs ...Regex) Regex { return r + "[^" + joinRxs(rxs, "") + "]" } // // COMPOSITES // // Then appends the given regexes to the current one. func (r Regex) Then(rxs ...Regex) Regex { return r + joinRxs(rxs, "") } // AnyOf appends an alternative to the regex. The resulting alternative // matches any of the given regexes. func (r Regex) AnyOf(rxs ...Regex) Regex { return r + "(?:" + joinRxs(rxs, "|") + ")" } // // QUANTIFIERS // // ZeroOrMore appends the given regex with a "zero or more" quantifier // ("*"), prefer more. 
func (r Regex) ZeroOrMore(rx Regex) Regex {
	return r + protectMultiChar(rx) + "*"
}

// OneOrMore appends the given regex with a "one or more" quantifier ("+"),
// prefer more.
func (r Regex) OneOrMore(rx Regex) Regex {
	return r + protectMultiChar(rx) + "+"
}

// ZeroOrOne appends the given regex with a "zero or one" quantifier ("?"),
// prefer one.
func (r Regex) ZeroOrOne(rx Regex) Regex {
	return r + protectMultiChar(rx) + "?"
}

// NTimes appends the given regex with an exact number of repeats ("{N}").
func (r Regex) NTimes(n int, rx Regex) Regex {
	return Regex(fmt.Sprintf("%s%s{%d}", r, protectMultiChar(rx), n))
}

// NOrMore appends the given regex with a minimum number of repeats
// ("{N,}"), prefer more.
func (r Regex) NOrMore(n int, rx Regex) Regex {
	return Regex(fmt.Sprintf("%s%s{%d,}", r, protectMultiChar(rx), n))
}

// NUpToM appends the given regex with a minimum and maximum number of
// repeats ("{N,M}"), prefer more.
func (r Regex) NUpToM(n, m int, rx Regex) Regex {
	return Regex(fmt.Sprintf("%s%s{%d,%d}", r, protectMultiChar(rx), n, m))
}

// ZeroOrMoreLazy appends the given regex with a lazy "zero or more"
// quantifier ("*?"), prefer fewer.
func (r Regex) ZeroOrMoreLazy(rx Regex) Regex {
	return r.ZeroOrMore(rx) + nonGreedyOp
}

// OneOrMoreLazy appends the given regex with a lazy "one or more"
// quantifier ("+?"), prefer fewer.
func (r Regex) OneOrMoreLazy(rx Regex) Regex {
	return r.OneOrMore(rx) + nonGreedyOp
}

// ZeroOrOneLazy appends the given regex with a lazy "zero or one"
// quantifier ("??"), prefer zero.
func (r Regex) ZeroOrOneLazy(rx Regex) Regex {
	return r.ZeroOrOne(rx) + nonGreedyOp
}

// NTimesLazy appends the given regex with an exact number of repeats
// followed by the lazy marker ("{N}?").
func (r Regex) NTimesLazy(n int, rx Regex) Regex {
	return r.NTimes(n, rx) + nonGreedyOp
}

// NOrMoreLazy appends the given regex with a minimum number of repeats
// ("{N,}?"), prefer fewer.
func (r Regex) NOrMoreLazy(n int, rx Regex) Regex {
	return r.NOrMore(n, rx) + nonGreedyOp
}

// NUpToMLazy appends the given regex with a minimum and maximum number of
// repeats ("{N,M}?"), prefer fewer.
func (r Regex) NUpToMLazy(n, m int, rx Regex) Regex {
	return r.NUpToM(n, m, rx) + nonGreedyOp
}

//
// GROUPS
//

// Capture appends a capture group to the regexes. The regexes given as
// arguments are just concatenated.
func (r Regex) Capture(rxs ...Regex) Regex {
	return r + "(" + joinRxs(rxs, "") + ")"
}

// CaptureName appends a named capture group to the regexes. The regexes
// given as arguments are just concatenated.
func (r Regex) CaptureName(name string, rxs ...Regex) Regex {
	return r + "(?P<" + Regex(name) + ">" + joinRxs(rxs, "") + ")"
}

// WithFlags appends a non-capturing group with the given flags set to the
// regex. The regexes given as arguments are just concatenated.
//
// When flags selects no flag at all, no group is emitted and the
// concatenated regexes are appended as is.
func (r Regex) WithFlags(flags Flags, rxs ...Regex) Regex {
	tmp := joinRxs(rxs, "")
	if f := buildFlagString(flags); f != "" {
		tmp = "(?" + Regex(f) + ":" + tmp + ")"
	}

	return r + tmp
}

//
// UTILS
//

// protectMultiChar returns rx in a form that can be directly followed by a
// quantifier. A regex recognized as a single atom is returned unchanged;
// anything else is wrapped in a non-capturing group so the quantifier
// applies to the whole of it.
//
//nolint:gomnd // do not want to create constant for every length
func protectMultiChar(rx Regex) Regex {
	rxs := string(rx)

	// a, ., ...
	isOneChar := len(rxs) == 1
	// \d, \b, ...
	isEscape := len(rxs) == 2 && rxs[0] == '\\'
	// Octals \123
	isOctal := 2 <= len(rxs) && len(rxs) <= 4 &&
		rxs[0] == '\\' && areDigits(rxs[1:])
	// Literals \Q...\E. The quoted text is a plain sequence of literal
	// characters, so a quantifier placed after \E only binds to the last
	// one (`\Qab\E*` means `ab*`). Only a single quoted character is
	// therefore atomic; longer (or empty) literals must be grouped.
	isLiteral := len(rxs) >= 4 &&
		rxs[:2] == `\Q` && rxs[len(rxs)-2:] == `\E` &&
		len([]rune(rxs[2:len(rxs)-2])) == 1

	if isOneChar || isEscape || isCharClass(rx) || isUnicodeClass(rx) ||
		isOctal || isHexa(rx) || isLiteral {
		return rx
	}

	return "(?:" + rx + ")"
}

// isHexa checks if the string is a single hexadecimal number: \x2F or
// \x{10FFFF}.
func isHexa(rx Regex) bool { s := string(rx) return (len(s) == 4 && s[0:2] == "\\x" && areHexaDigits(s[2:])) || (len(s) >= 5 && len(s) <= 10 && s[0] == '\\' && s[1] == 'x' && s[2] == '{' && s[len(s)-1] == '}' && areHexaDigits(s[3:len(rx)-1])) } // Checks if the regex is a single unicode character class: \pF and \PF, or // \p{Greek} and \P{Greek}, but not \p{Greek}\p{Digit}. func isUnicodeClass(rx Regex) bool { s := string(rx) return (len(s) == 3 && (s[0:2] == "\\p" || s[0:2] == "\\P")) || (len(s) >= 3 && (s[0:3] == "\\p{" || s[0:3] == "\\P{") && s[len(s)-1] == '}' && !strings.ContainsRune(s[1:len(s)-2], '}')) } // Checks if the regex is a single character class: [abc] but not [abc][def]. func isCharClass(rx Regex) bool { s := string(rx) return (len(s) >= 3 && s[0] == '[' && s[len(s)-1] == ']' && !strings.ContainsRune(s[1:len(s)-2], ']')) } func areDigits(s string) bool { for _, c := range s { if c <= '0' || '9' <= c { return false } } return true } func areHexaDigits(s string) bool { for _, c := range s { if !('0' <= c && c <= '9' || 'A' <= c && c <= 'F') { return false } } return true } func buildFlagString(flags Flags) string { set, cleared := "", "" if flags&CaseInsensitive != 0 { set += "i" } if flags&MultiLine != 0 { set += "m" } if flags&AnyNL != 0 { set += "s" } if flags&Ungreedy != 0 { set += "U" } if flags&CaseSensitive != 0 { cleared += "i" } if flags&SingleLine != 0 { cleared += "m" } if flags&AnyNoNL != 0 { cleared += "s" } if flags&Greedy != 0 { cleared += "U" } if cleared != "" { cleared = "-" + cleared } return set + cleared } func joinRxs(rxs []Regex, sep string) Regex { var rv Regex for i := range rxs { if i != 0 { rv += Regex(sep) } rv += rxs[i] } return rv }