Scanner.h
1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Modifications copyright 2017 Universite catholique de Louvain (UCL), Belgium
4 
5 Licensed under the Apache License, Version 2.0 (the "License");
6 you may not use this file except in compliance with the License.
7 You may obtain a copy of the License at
8 
9  http://www.apache.org/licenses/LICENSE-2.0
10 
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 ==============================================================================*/
17 
18 #pragma once
19 
20 #include <strings/StringPiece.h>
21 
22 namespace momemta {
23 namespace strings {
24 
25 // Scanner provides simplified string parsing, in which a string is parsed as a
26 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then
27 // finally GetResult is called. If GetResult returns true, then it also returns
28 // the remaining characters and any captured substring.
29 //
30 // The range to capture can be controlled with RestartCapture and StopCapture;
31 // by default, all processed characters are captured.
32 class Scanner {
33  public:
34  // Classes of characters. Each enum name is to be read as the union of the
35  // parts - e.g., class LETTER_DIGIT means the class includes all letters and
36  // all digits.
37  //
38  // LETTER means ascii letter a-zA-Z.
39  // DIGIT means ascii digit: 0-9.
40  enum CharClass {
41  // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest
42  // in scanner_test.cc
43  ALL,
44  DIGIT,
45  LETTER,
46  LETTER_DIGIT,
47  LETTER_DIGIT_DASH_UNDERSCORE,
48  LETTER_DIGIT_DASH_DOT_SLASH, // SLASH is / only, not backslash
49  LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE, // SLASH is / only, not backslash
50  LETTER_DIGIT_DOT,
51  LETTER_DIGIT_DOT_PLUS_MINUS,
52  LETTER_DIGIT_DOT_UNDERSCORE,
53  LETTER_DIGIT_UNDERSCORE,
54  LETTER_DIGIT_UNDERSCORE_COLON,
55  LOWERLETTER,
56  LOWERLETTER_DIGIT,
57  LOWERLETTER_DIGIT_UNDERSCORE,
58  NON_ZERO_DIGIT,
59  SPACE,
60  UPPERLETTER,
61  };
62 
63  explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); }
64 
65  // Consume the next character of the given class from input. If the next
66  // character is not in the class, then GetResult will ultimately return false.
67  Scanner& One(CharClass clz) {
68  if (cur_.empty() || !Matches(clz, cur_[0])) {
69  return Error();
70  }
71  cur_.remove_prefix(1);
72  return *this;
73  }
74 
75  // Consume the next s.size() characters of the input, if they match <s>. If
76  // they don't match <s>, this is a no-op.
77  Scanner& ZeroOrOneLiteral(StringPiece s) {
78  cur_.Consume(s);
79  return *this;
80  }
81 
82  // Consume the next s.size() characters of the input, if they match <s>. If
83  // they don't match <s>, then GetResult will ultimately return false.
84  Scanner& OneLiteral(StringPiece s) {
85  if (!cur_.Consume(s)) {
86  error_ = true;
87  }
88  return *this;
89  }
90 
91  // Consume characters from the input as long as they match <clz>. Zero
92  // characters is still considered a match, so it will never cause GetResult to
93  // return false.
94  Scanner& Any(CharClass clz) {
95  while (!cur_.empty() && Matches(clz, cur_[0])) {
96  cur_.remove_prefix(1);
97  }
98  return *this;
99  }
100 
101  // Shorthand for One(clz).Any(clz).
102  Scanner& Many(CharClass clz) { return One(clz).Any(clz); }
103 
104  // Reset the capture start point.
105  //
106  // Later, when GetResult is called and if it returns true, the capture
107  // returned will start at the position at the time this was called.
108  Scanner& RestartCapture() {
109  capture_start_ = cur_.data();
110  capture_end_ = nullptr;
111  return *this;
112  }
113 
114  // Stop capturing input.
115  //
116  // Later, when GetResult is called and if it returns true, the capture
117  // returned will end at the position at the time this was called.
118  Scanner& StopCapture() {
119  capture_end_ = cur_.data();
120  return *this;
121  }
122 
123  // If not at the input of input, then GetResult will ultimately return false.
124  Scanner& Eos() {
125  if (!cur_.empty()) error_ = true;
126  return *this;
127  }
128 
129  // Shorthand for Any(SPACE).
130  Scanner& AnySpace() { return Any(SPACE); }
131 
132  // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
133  Scanner& ScanUntil(char end_ch) {
134  ScanUntilImpl(end_ch, false);
135  return *this;
136  }
137 
138  // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
139  // Backslash escape sequences are skipped.
140  // Used for implementing quoted string scanning.
141  Scanner& ScanEscapedUntil(char end_ch) {
142  ScanUntilImpl(end_ch, true);
143  return *this;
144  }
145 
146  // Return the next character that will be scanned, or <default_value> if there
147  // are no more characters to scan.
148  // Note that if a scan operation has failed (so GetResult() returns false),
149  // then the value of Peek may or may not have advanced since the scan
150  // operation that failed.
151  char Peek(char default_value = '\0') const {
152  return cur_.empty() ? default_value : cur_[0];
153  }
154 
155  // Returns false if there are no remaining characters to consume.
156  int empty() const { return cur_.empty(); }
157 
158  // Returns true if the input string successfully matched. When true is
159  // returned, the remaining string is returned in <remaining> and the captured
160  // string returned in <capture>, if non-NULL.
161  bool GetResult(StringPiece* remaining = nullptr,
162  StringPiece* capture = nullptr);
163 
164  private:
165  void ScanUntilImpl(char end_ch, bool escaped);
166 
167  Scanner& Error() {
168  error_ = true;
169  return *this;
170  }
171 
172  static bool IsLetter(char ch) {
173  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
174  }
175 
176  static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; }
177 
178  static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; }
179 
180  static bool IsSpace(char ch) {
181  return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
182  ch == '\r');
183  }
184 
185 public:
186  static bool Matches(CharClass clz, char ch) {
187  switch (clz) {
188  case ALL:
189  return true;
190  case DIGIT:
191  return IsDigit(ch);
192  case LETTER:
193  return IsLetter(ch);
194  case LETTER_DIGIT:
195  return IsLetter(ch) || IsDigit(ch);
196  case LETTER_DIGIT_DASH_UNDERSCORE:
197  return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_');
198  case LETTER_DIGIT_DASH_DOT_SLASH:
199  return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
200  ch == '/';
201  case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE:
202  return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
203  ch == '/' || ch == '_');
204  case LETTER_DIGIT_DOT:
205  return IsLetter(ch) || IsDigit(ch) || ch == '.';
206  case LETTER_DIGIT_DOT_PLUS_MINUS:
207  return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' ||
208  ch == '.';
209  case LETTER_DIGIT_DOT_UNDERSCORE:
210  return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_';
211  case LETTER_DIGIT_UNDERSCORE:
212  return IsLetter(ch) || IsDigit(ch) || ch == '_';
213  case LETTER_DIGIT_UNDERSCORE_COLON:
214  return IsLetter(ch) || IsDigit(ch) || ch == '_' || ch == ':';
215  case LOWERLETTER:
216  return ch >= 'a' && ch <= 'z';
217  case LOWERLETTER_DIGIT:
218  return IsLowerLetter(ch) || IsDigit(ch);
219  case LOWERLETTER_DIGIT_UNDERSCORE:
220  return IsLowerLetter(ch) || IsDigit(ch) || ch == '_';
221  case NON_ZERO_DIGIT:
222  return IsDigit(ch) && ch != '0';
223  case SPACE:
224  return IsSpace(ch);
225  case UPPERLETTER:
226  return ch >= 'A' && ch <= 'Z';
227  }
228  return false;
229  }
230 
231 private:
232 
233  StringPiece cur_;
234  const char* capture_start_ = nullptr;
235  const char* capture_end_ = nullptr;
236  bool error_ = false;
237 
238  Scanner(const Scanner&) = delete;
239  void operator=(const Scanner&) = delete;
240 };
241 
242 } // namespace strings
243 } // namespace momemta
Definition: Graph.h:21