From e47659149591da19ec844f691260ae4b0e75647a Mon Sep 17 00:00:00 2001 From: Robin Jadoul Date: Mon, 25 Apr 2016 20:39:38 +0200 Subject: [PATCH] Finish regex parsing --- src/re.cpp | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 6 deletions(-) diff --git a/src/re.cpp b/src/re.cpp index a51b875..84318b4 100644 --- a/src/re.cpp +++ b/src/re.cpp @@ -119,7 +119,7 @@ namespace lxs { stk.push(tp); } - std::shared_ptr parseEscapeChar(char c) { + char parseEscapeChar(char c) { switch (c) { case '\\': @@ -154,7 +154,113 @@ namespace lxs { default: throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str()); } - return std::make_shared(c); + return c; + } + + void sumREs(std::vector >& res) + { + for (std::size_t step = 2; step < res.size(); step <<= 1) + { + for (std::size_t i = 0; i < res.size(); i += step) + { + if (i + step / 2 < res.size()) + res[i] = std::make_shared(res[i], res[i + step / 2]); + } + } + } + + std::shared_ptr parseCharacterClass(string& input, size_t& idx) { + if (idx >= input.size()) + throw SyntaxError("Unclosed character class"); + std::vector > chars; + std::set used_chars; + + bool invert = false; + + if (input[idx] == '^') + { + invert = true; + idx++; + } + + if (idx >= input.size()) + throw SyntaxError("Unclosed character class"); + + + if (input[idx] == ']') + { + used_chars.insert(']'); + idx++; + } + + if (idx >= input.size()) + throw SyntaxError("Unclosed character class"); + + if (input[idx] == '-') + { + used_chars.insert('-'); + idx++; + } + + if (idx >= input.size()) + throw SyntaxError("Unclosed character class"); + + for (; idx < input.size() && input[idx] != ']'; idx++) + { + if (input[idx] == '-') + { + idx++; + + if (idx >= input.size()) + throw SyntaxError("Unclosed character class"); + + if (input[idx] == ']') + { + used_chars.insert('-'); + } + else + { + for (int i = ((SingleRE*)(chars[chars.size() - 1].get()))->c + 1; i <= input[idx]; i++) + { + used_chars.insert((char) i); + } + } + } + else if (input[idx] == '\\') + { + idx++; + if (idx >= input.size()) + throw SyntaxError("Unclosed character classe"); + used_chars.insert(parseEscapeChar(input[idx])); + } + else + { + used_chars.insert(input[idx]); + } + } + + if (idx >= input.size()) + throw SyntaxError("Unclosed character class"); + idx++; //Eat the ] + + for (int i = 0; i < 256; i++) + { + if (invert ^ (used_chars.count((char) i) > 0)) + chars.push_back(std::make_shared((char) i)); + } + + sumREs(chars); + + return chars[0]; + } + + std::shared_ptr dotChar() { + std::vector > any; + for (int i = 0; i < 256; i++) + if ((char) i != '\n') //Dot matches anything except newlines + any.push_back(std::make_shared((char) i)); + sumREs(any); + return any[0]; } std::shared_ptr parseRE(string& input, size_t& idx) @@ -170,16 +276,19 @@ namespace lxs { if (idx >= input.length()) throw SyntaxError("Escape sequence at the end of the string"); else - throw SyntaxError(("invalid escape sequence: \\" + string(1, input[idx])).c_str()); - stk.push(parseEscapeChar(input[idx++])); + stk.push(std::make_shared(parseEscapeChar(input[idx]))); break; case '[': - //TODO: parse character classes + stk.push(parseCharacterClass(input, ++idx)); break; case '.': - //TODO: any character + for (int c = 0; c <= 256; c++) + { + stk.push(dotChar()); + } + break; case ']': throw SyntaxError("Unopened ']'");