Horizon
lexer.hpp
1 #pragma once
2 
3 #include <clocale> // localeconv
4 #include <cstddef> // size_t
5 #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
6 #include <initializer_list> // initializer_list
7 #include <ios> // hex, uppercase
8 #include <iomanip> // setw, setfill
9 #include <sstream> // stringstream
10 #include <string> // char_traits, string
11 #include <vector> // vector
12 
13 #include <nlohmann/detail/macro_scope.hpp>
14 #include <nlohmann/detail/input/input_adapters.hpp>
15 
16 namespace nlohmann
17 {
18 namespace detail
19 {
21 // lexer //
23 
29 template<typename BasicJsonType>
30 class lexer
31 {
32  using number_integer_t = typename BasicJsonType::number_integer_t;
33  using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
34  using number_float_t = typename BasicJsonType::number_float_t;
35 
36  public:
/// token types for the parser
enum class token_type
{
    uninitialized,    ///< indicating the scanner is uninitialized
    literal_true,     ///< the `true` literal
    literal_false,    ///< the `false` literal
    literal_null,     ///< the `null` literal
    value_string,     ///< a string -- use move_string() for actual value
    value_unsigned,   ///< an unsigned integer -- use get_number_unsigned() for actual value
    value_integer,    ///< a signed integer -- use get_number_integer() for actual value
    value_float,      ///< a floating-point number -- use get_number_float() for actual value
    begin_array,      ///< the character for array begin `[`
    begin_object,     ///< the character for object begin `{`
    end_array,        ///< the character for array end `]`
    end_object,       ///< the character for object end `}`
    name_separator,   ///< the name separator `:`
    value_separator,  ///< the value separator `,`
    parse_error,      ///< indicating a parse error
    end_of_input,     ///< indicating the end of the input buffer
    literal_or_value  ///< a literal or the begin of a value (only for diagnostics)
};

/// return name of values of type token_type (only used for errors)
///
/// @param t  token type to describe
/// @return pointer to a string literal with static storage duration; the
///         three numeric token types share the text "number literal"
static const char* token_type_name(const token_type t) noexcept
{
    switch (t)
    {
        case token_type::uninitialized:
            return "<uninitialized>";
        case token_type::literal_true:
            return "true literal";
        case token_type::literal_false:
            return "false literal";
        case token_type::literal_null:
            return "null literal";
        case token_type::value_string:
            return "string literal";
        case token_type::value_unsigned:
        case token_type::value_integer:
        case token_type::value_float:
            return "number literal";
        case token_type::begin_array:
            return "'['";
        case token_type::begin_object:
            return "'{'";
        case token_type::end_array:
            return "']'";
        case token_type::end_object:
            return "'}'";
        case token_type::name_separator:
            return "':'";
        case token_type::value_separator:
            return "','";
        case token_type::parse_error:
            return "<parse error>";
        case token_type::end_of_input:
            return "end of input";
        case token_type::literal_or_value:
            return "'[', '{', or a literal";
        default: // catch non-enum values
            return "unknown token"; // LCOV_EXCL_LINE
    }
}
100 
101  explicit lexer(detail::input_adapter_t adapter)
102  : ia(std::move(adapter)), decimal_point_char(get_decimal_point()) {}
103 
104  // delete because of pointer members
105  lexer(const lexer&) = delete;
106  lexer& operator=(lexer&) = delete;
107 
108  private:
110  // locales
112 
/// return the locale-dependent decimal point
///
/// @return first character of the `decimal_point` string reported by
///         localeconv(), or '.' if that string pointer is null
static char get_decimal_point() noexcept
{
    const auto locale_info = localeconv();
    assert(locale_info != nullptr);

    if (locale_info->decimal_point != nullptr)
    {
        return *(locale_info->decimal_point);
    }
    return '.';
}
120 
122  // scan functions
124 
140  int get_codepoint()
141  {
142  // this function only makes sense after reading `\u`
143  assert(current == 'u');
144  int codepoint = 0;
145 
146  const auto factors = { 12, 8, 4, 0 };
147  for (const auto factor : factors)
148  {
149  get();
150 
151  if (current >= '0' and current <= '9')
152  {
153  codepoint += ((current - 0x30) << factor);
154  }
155  else if (current >= 'A' and current <= 'F')
156  {
157  codepoint += ((current - 0x37) << factor);
158  }
159  else if (current >= 'a' and current <= 'f')
160  {
161  codepoint += ((current - 0x57) << factor);
162  }
163  else
164  {
165  return -1;
166  }
167  }
168 
169  assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
170  return codepoint;
171  }
172 
188  bool next_byte_in_range(std::initializer_list<int> ranges)
189  {
190  assert(ranges.size() == 2 or ranges.size() == 4 or ranges.size() == 6);
191  add(current);
192 
193  for (auto range = ranges.begin(); range != ranges.end(); ++range)
194  {
195  get();
196  if (JSON_LIKELY(*range <= current and current <= *(++range)))
197  {
198  add(current);
199  }
200  else
201  {
202  error_message = "invalid string: ill-formed UTF-8 byte";
203  return false;
204  }
205  }
206 
207  return true;
208  }
209 
225  token_type scan_string()
226  {
227  // reset token_buffer (ignore opening quote)
228  reset();
229 
230  // we entered the function by reading an open quote
231  assert(current == '\"');
232 
233  while (true)
234  {
235  // get next character
236  switch (get())
237  {
238  // end of file while parsing string
239  case std::char_traits<char>::eof():
240  {
241  error_message = "invalid string: missing closing quote";
243  }
244 
245  // closing quote
246  case '\"':
247  {
248  return token_type::value_string;
249  }
250 
251  // escapes
252  case '\\':
253  {
254  switch (get())
255  {
256  // quotation mark
257  case '\"':
258  add('\"');
259  break;
260  // reverse solidus
261  case '\\':
262  add('\\');
263  break;
264  // solidus
265  case '/':
266  add('/');
267  break;
268  // backspace
269  case 'b':
270  add('\b');
271  break;
272  // form feed
273  case 'f':
274  add('\f');
275  break;
276  // line feed
277  case 'n':
278  add('\n');
279  break;
280  // carriage return
281  case 'r':
282  add('\r');
283  break;
284  // tab
285  case 't':
286  add('\t');
287  break;
288 
289  // unicode escapes
290  case 'u':
291  {
292  const int codepoint1 = get_codepoint();
293  int codepoint = codepoint1; // start with codepoint1
294 
295  if (JSON_UNLIKELY(codepoint1 == -1))
296  {
297  error_message = "invalid string: '\\u' must be followed by 4 hex digits";
298  return token_type::parse_error;
299  }
300 
301  // check if code point is a high surrogate
302  if (0xD800 <= codepoint1 and codepoint1 <= 0xDBFF)
303  {
304  // expect next \uxxxx entry
305  if (JSON_LIKELY(get() == '\\' and get() == 'u'))
306  {
307  const int codepoint2 = get_codepoint();
308 
309  if (JSON_UNLIKELY(codepoint2 == -1))
310  {
311  error_message = "invalid string: '\\u' must be followed by 4 hex digits";
312  return token_type::parse_error;
313  }
314 
315  // check if codepoint2 is a low surrogate
316  if (JSON_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF))
317  {
318  // overwrite codepoint
319  codepoint =
320  // high surrogate occupies the most significant 22 bits
321  (codepoint1 << 10)
322  // low surrogate occupies the least significant 15 bits
323  + codepoint2
324  // there is still the 0xD800, 0xDC00 and 0x10000 noise
325  // in the result so we have to subtract with:
326  // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
327  - 0x35FDC00;
328  }
329  else
330  {
331  error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
332  return token_type::parse_error;
333  }
334  }
335  else
336  {
337  error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF";
338  return token_type::parse_error;
339  }
340  }
341  else
342  {
343  if (JSON_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF))
344  {
345  error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
346  return token_type::parse_error;
347  }
348  }
349 
350  // result of the above calculation yields a proper codepoint
351  assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
352 
353  // translate codepoint into bytes
354  if (codepoint < 0x80)
355  {
356  // 1-byte characters: 0xxxxxxx (ASCII)
357  add(codepoint);
358  }
359  else if (codepoint <= 0x7FF)
360  {
361  // 2-byte characters: 110xxxxx 10xxxxxx
362  add(0xC0 | (codepoint >> 6));
363  add(0x80 | (codepoint & 0x3F));
364  }
365  else if (codepoint <= 0xFFFF)
366  {
367  // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
368  add(0xE0 | (codepoint >> 12));
369  add(0x80 | ((codepoint >> 6) & 0x3F));
370  add(0x80 | (codepoint & 0x3F));
371  }
372  else
373  {
374  // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
375  add(0xF0 | (codepoint >> 18));
376  add(0x80 | ((codepoint >> 12) & 0x3F));
377  add(0x80 | ((codepoint >> 6) & 0x3F));
378  add(0x80 | (codepoint & 0x3F));
379  }
380 
381  break;
382  }
383 
384  // other characters after escape
385  default:
386  error_message = "invalid string: forbidden character after backslash";
387  return token_type::parse_error;
388  }
389 
390  break;
391  }
392 
393  // invalid control characters
394  case 0x00:
395  case 0x01:
396  case 0x02:
397  case 0x03:
398  case 0x04:
399  case 0x05:
400  case 0x06:
401  case 0x07:
402  case 0x08:
403  case 0x09:
404  case 0x0A:
405  case 0x0B:
406  case 0x0C:
407  case 0x0D:
408  case 0x0E:
409  case 0x0F:
410  case 0x10:
411  case 0x11:
412  case 0x12:
413  case 0x13:
414  case 0x14:
415  case 0x15:
416  case 0x16:
417  case 0x17:
418  case 0x18:
419  case 0x19:
420  case 0x1A:
421  case 0x1B:
422  case 0x1C:
423  case 0x1D:
424  case 0x1E:
425  case 0x1F:
426  {
427  error_message = "invalid string: control character must be escaped";
428  return token_type::parse_error;
429  }
430 
431  // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
432  case 0x20:
433  case 0x21:
434  case 0x23:
435  case 0x24:
436  case 0x25:
437  case 0x26:
438  case 0x27:
439  case 0x28:
440  case 0x29:
441  case 0x2A:
442  case 0x2B:
443  case 0x2C:
444  case 0x2D:
445  case 0x2E:
446  case 0x2F:
447  case 0x30:
448  case 0x31:
449  case 0x32:
450  case 0x33:
451  case 0x34:
452  case 0x35:
453  case 0x36:
454  case 0x37:
455  case 0x38:
456  case 0x39:
457  case 0x3A:
458  case 0x3B:
459  case 0x3C:
460  case 0x3D:
461  case 0x3E:
462  case 0x3F:
463  case 0x40:
464  case 0x41:
465  case 0x42:
466  case 0x43:
467  case 0x44:
468  case 0x45:
469  case 0x46:
470  case 0x47:
471  case 0x48:
472  case 0x49:
473  case 0x4A:
474  case 0x4B:
475  case 0x4C:
476  case 0x4D:
477  case 0x4E:
478  case 0x4F:
479  case 0x50:
480  case 0x51:
481  case 0x52:
482  case 0x53:
483  case 0x54:
484  case 0x55:
485  case 0x56:
486  case 0x57:
487  case 0x58:
488  case 0x59:
489  case 0x5A:
490  case 0x5B:
491  case 0x5D:
492  case 0x5E:
493  case 0x5F:
494  case 0x60:
495  case 0x61:
496  case 0x62:
497  case 0x63:
498  case 0x64:
499  case 0x65:
500  case 0x66:
501  case 0x67:
502  case 0x68:
503  case 0x69:
504  case 0x6A:
505  case 0x6B:
506  case 0x6C:
507  case 0x6D:
508  case 0x6E:
509  case 0x6F:
510  case 0x70:
511  case 0x71:
512  case 0x72:
513  case 0x73:
514  case 0x74:
515  case 0x75:
516  case 0x76:
517  case 0x77:
518  case 0x78:
519  case 0x79:
520  case 0x7A:
521  case 0x7B:
522  case 0x7C:
523  case 0x7D:
524  case 0x7E:
525  case 0x7F:
526  {
527  add(current);
528  break;
529  }
530 
531  // U+0080..U+07FF: bytes C2..DF 80..BF
532  case 0xC2:
533  case 0xC3:
534  case 0xC4:
535  case 0xC5:
536  case 0xC6:
537  case 0xC7:
538  case 0xC8:
539  case 0xC9:
540  case 0xCA:
541  case 0xCB:
542  case 0xCC:
543  case 0xCD:
544  case 0xCE:
545  case 0xCF:
546  case 0xD0:
547  case 0xD1:
548  case 0xD2:
549  case 0xD3:
550  case 0xD4:
551  case 0xD5:
552  case 0xD6:
553  case 0xD7:
554  case 0xD8:
555  case 0xD9:
556  case 0xDA:
557  case 0xDB:
558  case 0xDC:
559  case 0xDD:
560  case 0xDE:
561  case 0xDF:
562  {
563  if (JSON_UNLIKELY(not next_byte_in_range({0x80, 0xBF})))
564  {
565  return token_type::parse_error;
566  }
567  break;
568  }
569 
570  // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
571  case 0xE0:
572  {
573  if (JSON_UNLIKELY(not (next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
574  {
575  return token_type::parse_error;
576  }
577  break;
578  }
579 
580  // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
581  // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
582  case 0xE1:
583  case 0xE2:
584  case 0xE3:
585  case 0xE4:
586  case 0xE5:
587  case 0xE6:
588  case 0xE7:
589  case 0xE8:
590  case 0xE9:
591  case 0xEA:
592  case 0xEB:
593  case 0xEC:
594  case 0xEE:
595  case 0xEF:
596  {
597  if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
598  {
599  return token_type::parse_error;
600  }
601  break;
602  }
603 
604  // U+D000..U+D7FF: bytes ED 80..9F 80..BF
605  case 0xED:
606  {
607  if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
608  {
609  return token_type::parse_error;
610  }
611  break;
612  }
613 
614  // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
615  case 0xF0:
616  {
617  if (JSON_UNLIKELY(not (next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
618  {
619  return token_type::parse_error;
620  }
621  break;
622  }
623 
624  // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
625  case 0xF1:
626  case 0xF2:
627  case 0xF3:
628  {
629  if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
630  {
631  return token_type::parse_error;
632  }
633  break;
634  }
635 
636  // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
637  case 0xF4:
638  {
639  if (JSON_UNLIKELY(not (next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
640  {
641  return token_type::parse_error;
642  }
643  break;
644  }
645 
646  // remaining bytes (80..C1 and F5..FF) are ill-formed
647  default:
648  {
649  error_message = "invalid string: ill-formed UTF-8 byte";
650  return token_type::parse_error;
651  }
652  }
653  }
654  }
655 
// Overload set that lets scan_number() convert text to whichever
// floating-point type number_float_t is: each overload forwards to the
// matching C library function and writes the result through its first
// parameter. `endptr` receives the position where parsing stopped.

/// parse @a str as `float` via std::strtof
static void strtof(float& f, const char* str, char** endptr) noexcept
{
    const float parsed = std::strtof(str, endptr);
    f = parsed;
}

/// parse @a str as `double` via std::strtod
static void strtof(double& f, const char* str, char** endptr) noexcept
{
    const double parsed = std::strtod(str, endptr);
    f = parsed;
}

/// parse @a str as `long double` via std::strtold
static void strtof(long double& f, const char* str, char** endptr) noexcept
{
    const long double parsed = std::strtold(str, endptr);
    f = parsed;
}
670 
711  token_type scan_number()
712  {
713  // reset token_buffer to store the number's bytes
714  reset();
715 
716  // the type of the parsed number; initially set to unsigned; will be
717  // changed if minus sign, decimal point or exponent is read
718  token_type number_type = token_type::value_unsigned;
719 
720  // state (init): we just found out we need to scan a number
721  switch (current)
722  {
723  case '-':
724  {
725  add(current);
726  goto scan_number_minus;
727  }
728 
729  case '0':
730  {
731  add(current);
732  goto scan_number_zero;
733  }
734 
735  case '1':
736  case '2':
737  case '3':
738  case '4':
739  case '5':
740  case '6':
741  case '7':
742  case '8':
743  case '9':
744  {
745  add(current);
746  goto scan_number_any1;
747  }
748 
749  default:
750  {
751  // all other characters are rejected outside scan_number()
752  assert(false); // LCOV_EXCL_LINE
753  }
754  }
755 
756 scan_number_minus:
757  // state: we just parsed a leading minus sign
758  number_type = token_type::value_integer;
759  switch (get())
760  {
761  case '0':
762  {
763  add(current);
764  goto scan_number_zero;
765  }
766 
767  case '1':
768  case '2':
769  case '3':
770  case '4':
771  case '5':
772  case '6':
773  case '7':
774  case '8':
775  case '9':
776  {
777  add(current);
778  goto scan_number_any1;
779  }
780 
781  default:
782  {
783  error_message = "invalid number; expected digit after '-'";
784  return token_type::parse_error;
785  }
786  }
787 
788 scan_number_zero:
789  // state: we just parse a zero (maybe with a leading minus sign)
790  switch (get())
791  {
792  case '.':
793  {
794  add(decimal_point_char);
795  goto scan_number_decimal1;
796  }
797 
798  case 'e':
799  case 'E':
800  {
801  add(current);
802  goto scan_number_exponent;
803  }
804 
805  default:
806  goto scan_number_done;
807  }
808 
809 scan_number_any1:
810  // state: we just parsed a number 0-9 (maybe with a leading minus sign)
811  switch (get())
812  {
813  case '0':
814  case '1':
815  case '2':
816  case '3':
817  case '4':
818  case '5':
819  case '6':
820  case '7':
821  case '8':
822  case '9':
823  {
824  add(current);
825  goto scan_number_any1;
826  }
827 
828  case '.':
829  {
830  add(decimal_point_char);
831  goto scan_number_decimal1;
832  }
833 
834  case 'e':
835  case 'E':
836  {
837  add(current);
838  goto scan_number_exponent;
839  }
840 
841  default:
842  goto scan_number_done;
843  }
844 
845 scan_number_decimal1:
846  // state: we just parsed a decimal point
847  number_type = token_type::value_float;
848  switch (get())
849  {
850  case '0':
851  case '1':
852  case '2':
853  case '3':
854  case '4':
855  case '5':
856  case '6':
857  case '7':
858  case '8':
859  case '9':
860  {
861  add(current);
862  goto scan_number_decimal2;
863  }
864 
865  default:
866  {
867  error_message = "invalid number; expected digit after '.'";
868  return token_type::parse_error;
869  }
870  }
871 
872 scan_number_decimal2:
873  // we just parsed at least one number after a decimal point
874  switch (get())
875  {
876  case '0':
877  case '1':
878  case '2':
879  case '3':
880  case '4':
881  case '5':
882  case '6':
883  case '7':
884  case '8':
885  case '9':
886  {
887  add(current);
888  goto scan_number_decimal2;
889  }
890 
891  case 'e':
892  case 'E':
893  {
894  add(current);
895  goto scan_number_exponent;
896  }
897 
898  default:
899  goto scan_number_done;
900  }
901 
902 scan_number_exponent:
903  // we just parsed an exponent
904  number_type = token_type::value_float;
905  switch (get())
906  {
907  case '+':
908  case '-':
909  {
910  add(current);
911  goto scan_number_sign;
912  }
913 
914  case '0':
915  case '1':
916  case '2':
917  case '3':
918  case '4':
919  case '5':
920  case '6':
921  case '7':
922  case '8':
923  case '9':
924  {
925  add(current);
926  goto scan_number_any2;
927  }
928 
929  default:
930  {
931  error_message =
932  "invalid number; expected '+', '-', or digit after exponent";
933  return token_type::parse_error;
934  }
935  }
936 
937 scan_number_sign:
938  // we just parsed an exponent sign
939  switch (get())
940  {
941  case '0':
942  case '1':
943  case '2':
944  case '3':
945  case '4':
946  case '5':
947  case '6':
948  case '7':
949  case '8':
950  case '9':
951  {
952  add(current);
953  goto scan_number_any2;
954  }
955 
956  default:
957  {
958  error_message = "invalid number; expected digit after exponent sign";
959  return token_type::parse_error;
960  }
961  }
962 
963 scan_number_any2:
964  // we just parsed a number after the exponent or exponent sign
965  switch (get())
966  {
967  case '0':
968  case '1':
969  case '2':
970  case '3':
971  case '4':
972  case '5':
973  case '6':
974  case '7':
975  case '8':
976  case '9':
977  {
978  add(current);
979  goto scan_number_any2;
980  }
981 
982  default:
983  goto scan_number_done;
984  }
985 
986 scan_number_done:
987  // unget the character after the number (we only read it to know that
988  // we are done scanning a number)
989  unget();
990 
991  char* endptr = nullptr;
992  errno = 0;
993 
994  // try to parse integers first and fall back to floats
995  if (number_type == token_type::value_unsigned)
996  {
997  const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
998 
999  // we checked the number format before
1000  assert(endptr == token_buffer.data() + token_buffer.size());
1001 
1002  if (errno == 0)
1003  {
1004  value_unsigned = static_cast<number_unsigned_t>(x);
1005  if (value_unsigned == x)
1006  {
1007  return token_type::value_unsigned;
1008  }
1009  }
1010  }
1011  else if (number_type == token_type::value_integer)
1012  {
1013  const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1014 
1015  // we checked the number format before
1016  assert(endptr == token_buffer.data() + token_buffer.size());
1017 
1018  if (errno == 0)
1019  {
1020  value_integer = static_cast<number_integer_t>(x);
1021  if (value_integer == x)
1022  {
1023  return token_type::value_integer;
1024  }
1025  }
1026  }
1027 
1028  // this code is reached if we parse a floating-point number or if an
1029  // integer conversion above failed
1030  strtof(value_float, token_buffer.data(), &endptr);
1031 
1032  // we checked the number format before
1033  assert(endptr == token_buffer.data() + token_buffer.size());
1034 
1035  return token_type::value_float;
1036  }
1037 
1043  token_type scan_literal(const char* literal_text, const std::size_t length,
1044  token_type return_type)
1045  {
1046  assert(current == literal_text[0]);
1047  for (std::size_t i = 1; i < length; ++i)
1048  {
1049  if (JSON_UNLIKELY(get() != literal_text[i]))
1050  {
1051  error_message = "invalid literal";
1052  return token_type::parse_error;
1053  }
1054  }
1055  return return_type;
1056  }
1057 
1059  // input management
1061 
1063  void reset() noexcept
1064  {
1065  token_buffer.clear();
1066  token_string.clear();
1067  token_string.push_back(std::char_traits<char>::to_char_type(current));
1068  }
1069 
1070  /*
1071  @brief get next character from the input
1072 
1073  This function provides the interface to the used input adapter. It does
1074  not throw in case the input reached EOF, but returns a
1075  `std::char_traits<char>::eof()` in that case. Stores the scanned characters
1076  for use in error messages.
1077 
1078  @return character read from the input
1079  */
1080  std::char_traits<char>::int_type get()
1081  {
1082  ++chars_read;
1083  current = ia->get_character();
1084  if (JSON_LIKELY(current != std::char_traits<char>::eof()))
1085  {
1086  token_string.push_back(std::char_traits<char>::to_char_type(current));
1087  }
1088  return current;
1089  }
1090 
1092  void unget()
1093  {
1094  --chars_read;
1095  if (JSON_LIKELY(current != std::char_traits<char>::eof()))
1096  {
1097  ia->unget_character();
1098  assert(token_string.size() != 0);
1099  token_string.pop_back();
1100  }
1101  }
1102 
1104  void add(int c)
1105  {
1106  token_buffer.push_back(std::char_traits<char>::to_char_type(c));
1107  }
1108 
1109  public:
1111  // value getters
1113 
1115  constexpr number_integer_t get_number_integer() const noexcept
1116  {
1117  return value_integer;
1118  }
1119 
1121  constexpr number_unsigned_t get_number_unsigned() const noexcept
1122  {
1123  return value_unsigned;
1124  }
1125 
1127  constexpr number_float_t get_number_float() const noexcept
1128  {
1129  return value_float;
1130  }
1131 
1134  {
1135  return std::move(token_buffer);
1136  }
1137 
1139  // diagnostics
1141 
1143  constexpr std::size_t get_position() const noexcept
1144  {
1145  return chars_read;
1146  }
1147 
1152  {
1153  // escape control characters
1154  std::string result;
1155  for (const auto c : token_string)
1156  {
1157  if ('\x00' <= c and c <= '\x1F')
1158  {
1159  // escape control characters
1160  std::stringstream ss;
1161  ss << "<U+" << std::setw(4) << std::uppercase << std::setfill('0')
1162  << std::hex << static_cast<int>(c) << ">";
1163  result += ss.str();
1164  }
1165  else
1166  {
1167  // add character as is
1168  result.push_back(c);
1169  }
1170  }
1171 
1172  return result;
1173  }
1174 
1176  constexpr const char* get_error_message() const noexcept
1177  {
1178  return error_message;
1179  }
1180 
1182  // actual scanner
1184 
1185  token_type scan()
1186  {
1187  // read next character and ignore whitespace
1188  do
1189  {
1190  get();
1191  }
1192  while (current == ' ' or current == '\t' or current == '\n' or current == '\r');
1193 
1194  switch (current)
1195  {
1196  // structural characters
1197  case '[':
1198  return token_type::begin_array;
1199  case ']':
1200  return token_type::end_array;
1201  case '{':
1202  return token_type::begin_object;
1203  case '}':
1204  return token_type::end_object;
1205  case ':':
1206  return token_type::name_separator;
1207  case ',':
1208  return token_type::value_separator;
1209 
1210  // literals
1211  case 't':
1212  return scan_literal("true", 4, token_type::literal_true);
1213  case 'f':
1214  return scan_literal("false", 5, token_type::literal_false);
1215  case 'n':
1216  return scan_literal("null", 4, token_type::literal_null);
1217 
1218  // string
1219  case '\"':
1220  return scan_string();
1221 
1222  // number
1223  case '-':
1224  case '0':
1225  case '1':
1226  case '2':
1227  case '3':
1228  case '4':
1229  case '5':
1230  case '6':
1231  case '7':
1232  case '8':
1233  case '9':
1234  return scan_number();
1235 
1236  // end of input (the null byte is needed when parsing from
1237  // string literals)
1238  case '\0':
1239  case std::char_traits<char>::eof():
1240  return token_type::end_of_input;
1241 
1242  // error
1243  default:
1244  error_message = "invalid literal";
1245  return token_type::parse_error;
1246  }
1247  }
1248 
1249  private:
/// input adapter the characters are read from (see get() / unget())
1251 detail::input_adapter_t ia = nullptr;
1252 
/// the character most recently returned by get(); EOF until the first read
1254 std::char_traits<char>::int_type current = std::char_traits<char>::eof();
1255 
/// character counter: incremented by get(), decremented by unget(),
/// reported by get_position()
1257 std::size_t chars_read = 0;
1258 
/// raw input characters of the current token, as read; used by
/// get_token_string() for error messages
1260 std::vector<char> token_string {};
1261 
/// processed value of the current token: decoded string contents or the
/// bytes of a number; handed out by move_string()
1263 std::string token_buffer {};
1264 
/// description of the last parse error; returned by get_error_message()
1266 const char* error_message = "";
1267 
1268 // number values
/// results of scan_number(); which one is valid depends on the returned
/// token type
1269 number_integer_t value_integer = 0;
1270 number_unsigned_t value_unsigned = 0;
1271 number_float_t value_float = 0;
1272 
/// decimal point character obtained from get_decimal_point() in the
/// constructor; scan_number() stores it in place of '.'
1274 const char decimal_point_char = '.';
1275 };
1276 }
1277 }
nlohmann::detail::lexer::token_type::begin_object
the character for object begin {
nlohmann::detail::lexer::token_type::literal_true
the true literal
nlohmann::detail::lexer::move_string
std::string move_string()
return current string value (implicitly resets the token; useful only once)
Definition: lexer.hpp:1133
nlohmann::detail::lexer::token_type::parse_error
indicating a parse error
nlohmann::detail::lexer::get_token_string
std::string get_token_string() const
return the last read token (for errors only).
Definition: lexer.hpp:1151
nlohmann
namespace for Niels Lohmann
Definition: adl_serializer.hpp:8
nlohmann::detail::lexer::token_type::value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
nlohmann::detail::lexer::token_type
token_type
token types for the parser
Definition: lexer.hpp:38
nlohmann::detail::lexer
lexical analysis
Definition: lexer.hpp:30
nlohmann::detail::lexer::token_type::value_integer
a signed integer – use get_number_integer() for actual value
nlohmann::detail::input_adapter_t
std::shared_ptr< input_adapter_protocol > input_adapter_t
a type to simplify interfaces
Definition: input_adapters.hpp:48
nlohmann::detail::value_t::string
string value
nlohmann::detail::lexer::get_position
constexpr std::size_t get_position() const noexcept
return position of last read token
Definition: lexer.hpp:1143
nlohmann::detail::lexer::token_type::end_of_input
indicating the end of the input buffer
nlohmann::detail::lexer::token_type_name
static const char * token_type_name(const token_type t) noexcept
return name of values of type token_type (only used for errors)
Definition: lexer.hpp:60
nlohmann::detail::lexer::token_type::uninitialized
indicating the scanner is uninitialized
nlohmann::detail::lexer::get_number_unsigned
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition: lexer.hpp:1121
nlohmann::detail::lexer::token_type::end_array
the character for array end ]
nlohmann::detail::lexer::token_type::value_separator
the value separator ,
nlohmann::detail::lexer::token_type::literal_null
the null literal
nlohmann::detail::lexer::get_number_integer
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition: lexer.hpp:1115
nlohmann::detail::lexer::token_type::literal_false
the false literal
nlohmann::detail::lexer::token_type::name_separator
the name separator :
nlohmann::detail::lexer::get_error_message
constexpr const char * get_error_message() const noexcept
return syntax error message
Definition: lexer.hpp:1176
nlohmann::detail::lexer::get_number_float
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition: lexer.hpp:1127
nlohmann::detail::lexer::token_type::literal_or_value
a literal or the begin of a value (only for diagnostics)
nlohmann::detail::lexer::token_type::value_float
a floating-point number – use get_number_float() for actual value
nlohmann::detail::lexer::token_type::begin_array
the character for array begin [
nlohmann::detail::lexer::token_type::value_string
a string – use move_string() for actual value
nlohmann::detail::lexer::token_type::end_object
the character for object end }