lpeg-1.1.0/0000775000175000017500000000000014446336477012376 5ustar robertorobertolpeg-1.1.0/lptypes.h0000664000175000017500000000564514446336477014261 0ustar robertoroberto/* ** LPeg - PEG pattern matching for Lua ** Copyright 2007-2023, Lua.org & PUC-Rio (see 'lpeg.html' for license) ** written by Roberto Ierusalimschy */ #if !defined(lptypes_h) #define lptypes_h #include #include #include #include "lua.h" #define VERSION "1.1.0" #define PATTERN_T "lpeg-pattern" #define MAXSTACKIDX "lpeg-maxstack" /* ** compatibility with Lua 5.1 */ #if (LUA_VERSION_NUM == 501) #define lp_equal lua_equal #define lua_getuservalue lua_getfenv #define lua_setuservalue lua_setfenv #define lua_rawlen lua_objlen #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) typedef size_t lua_Unsigned; #endif #if !defined(lp_equal) #define lp_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) #endif /* default maximum size for call/backtrack stack */ #if !defined(MAXBACK) #define MAXBACK 400 #endif /* maximum number of rules in a grammar (limited by 'unsigned short') */ #if !defined(MAXRULES) #define MAXRULES 1000 #endif /* initial size for capture's list */ #define INITCAPSIZE 32 /* index, on Lua stack, for subject */ #define SUBJIDX 2 /* number of fixed arguments to 'match' (before capture arguments) */ #define FIXEDARGS 3 /* index, on Lua stack, for capture list */ #define caplistidx(ptop) ((ptop) + 2) /* index, on Lua stack, for pattern's ktable */ #define ktableidx(ptop) ((ptop) + 3) /* index, on Lua stack, for backtracking stack */ #define stackidx(ptop) ((ptop) + 4) typedef unsigned char byte; typedef unsigned int uint; #define BITSPERCHAR 8 #define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1) typedef struct Charset { byte cs[CHARSETSIZE]; } Charset; #define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} } #define fillset(s,c) memset(s,c,CHARSETSIZE) #define clearset(s) fillset(s,0) /* number of slots needed for 'n' bytes 
*/
#define bytes2slots(n)  (((n) - 1u) / (uint)sizeof(TTree) + 1u)

/* set 'b' bit in charset 'cs' */
#define setchar(cs,b)   ((cs)[(b) >> 3] |= (1 << ((b) & 7)))


/*
** in capture instructions, 'kind' of capture and its offset are
** packed in field 'aux1', 4 bits for each (kind in the low nibble,
** offset in the high nibble)
*/
#define getkind(op)		((op)->i.aux1 & 0xF)
#define getoff(op)		(((op)->i.aux1 >> 4) & 0xF)
#define joinkindoff(k,o)	((k) | ((o) << 4))

#define MAXOFF		0xF
#define MAXAUX		0xFF


/* maximum number of bytes to look behind */
#define MAXBEHIND	MAXAUX


/* maximum size (in elements) for a pattern */
#define MAXPATTSIZE	(SHRT_MAX - 10)


/* size (in instructions) for l bytes (l > 0); rounds up */
#define instsize(l)  ((int)(((l) + (uint)sizeof(Instruction) - 1u) \
                    / (uint)sizeof(Instruction)))


/* size (in elements) for a ISet instruction */
#define CHARSETINSTSIZE		(1 + instsize(CHARSETSIZE))

/* size (in elements) for a IFunc instruction */
#define funcinstsize(p)		((p)->i.aux + 2)


/* test bit 'c' in bit array 'st' (byte c/8, bit c%8); yields 0 or 1 */
#define testchar(st,c)	((((uint)(st)[((c) >> 3)]) >> ((c) & 7)) & 1)


#endif

lpeg-1.1.0/lpvm.c0000664000175000017500000003505714446336477013532 0ustar robertoroberto
#include
#include


#include "lua.h"
#include "lauxlib.h"

#include "lpcap.h"
#include "lptypes.h"
#include "lpvm.h"
#include "lpprint.h"


/* initial size for call/backtrack stack */
#if !defined(INITBACK)
#define INITBACK	MAXBACK
#endif


/* jump offset of instruction 'p' (stored in the slot following it) */
#define getoffset(p)	(((p) + 1)->offset)

/* sentinel instruction: 'match' installs it in the bottom stack entry */
static const Instruction giveup = {{IGiveup, 0, {0}}};


/*
** Test whether character 'c' belongs to the set encoded by instruction
** 'i', whose bit array is 'buff'. The set's offset is first subtracted
** from 'c'; because 'c' is unsigned, a value below the offset wraps to
** a large number and is handled by the out-of-range branch. When the
** adjusted 'c' is not covered by the bit array (whose length is the
** set's size in instructions converted to bits), the result is the
** default value kept in 'aux1'; otherwise it is the bit for 'c'.
*/
int charinset (const Instruction *i, const byte *buff, uint c) {
  c -= i->i.aux2.set.offset;
  if (c >= ((uint)i->i.aux2.set.size  /* size in instructions... */
           * (uint)sizeof(Instruction)  /* in bytes... */
           * 8u))  /* in bits */
    return i->i.aux1;  /* out of range; return default value */
  return testchar(buff, c);
}


/*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
*/ static const char *utf8_decode (const char *o, int *val) { static const uint limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu}; const unsigned char *s = (const unsigned char *)o; uint c = s[0]; /* first byte */ uint res = 0; /* final result */ if (c < 0x80) /* ascii? */ res = c; else { int count = 0; /* to count number of continuation bytes */ while (c & 0x40) { /* still have continuation bytes? */ int cc = s[++count]; /* read next byte */ if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ return NULL; /* invalid byte sequence */ res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ c <<= 1; /* to test next bit */ } res |= (c & 0x7F) << (count * 5); /* add first byte */ if (count > 3 || res > 0x10FFFFu || res <= limits[count]) return NULL; /* invalid byte sequence */ s += count; /* skip continuation bytes read */ } *val = res; return (const char *)s + 1; /* +1 to include first byte */ } /* ** {====================================================== ** Virtual Machine ** ======================================================= */ typedef struct Stack { const char *s; /* saved position (or NULL for calls) */ const Instruction *p; /* next instruction */ int caplevel; } Stack; #define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) /* ** Ensures the size of array 'capture' (with size '*capsize' and ** 'captop' elements being used) is enough to accomodate 'n' extra ** elements plus one. (Because several opcodes add stuff to the capture ** array, it is simpler to ensure the array always has at least one free ** slot upfront and check its size later.) */ /* new size in number of elements cannot overflow integers, and new size in bytes cannot overflow size_t. */ #define MAXNEWSIZE \ (((size_t)INT_MAX) <= (~(size_t)0 / sizeof(Capture)) ? 
\ ((size_t)INT_MAX) : (~(size_t)0 / sizeof(Capture))) static Capture *growcap (lua_State *L, Capture *capture, int *capsize, int captop, int n, int ptop) { if (*capsize - captop > n) return capture; /* no need to grow array */ else { /* must grow */ Capture *newc; uint newsize = captop + n + 1; /* minimum size needed */ if (newsize < (MAXNEWSIZE / 3) * 2) newsize += newsize / 2; /* 1.5 that size, if not too big */ else if (newsize < (MAXNEWSIZE / 9) * 8) newsize += newsize / 8; /* else, try 9/8 that size */ else luaL_error(L, "too many captures"); newc = (Capture *)lua_newuserdata(L, newsize * sizeof(Capture)); memcpy(newc, capture, captop * sizeof(Capture)); *capsize = newsize; lua_replace(L, caplistidx(ptop)); return newc; } } /* ** Double the size of the stack */ static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) { Stack *stack = getstackbase(L, ptop); Stack *newstack; int n = *stacklimit - stack; /* current stack size */ int max, newn; lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); max = lua_tointeger(L, -1); /* maximum allowed size */ lua_pop(L, 1); if (n >= max) /* already at maximum size? */ luaL_error(L, "backtrack stack overflow (current limit is %d)", max); newn = 2 * n; /* new size */ if (newn > max) newn = max; newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack)); memcpy(newstack, stack, n * sizeof(Stack)); lua_replace(L, stackidx(ptop)); *stacklimit = newstack + newn; return newstack + n; /* return next position */ } /* ** Interpret the result of a dynamic capture: false -> fail; ** true -> keep current position; number -> next position. ** Return new subject position. 'fr' is stack index where ** is the result; 'curr' is current subject position; 'limit' ** is subject's size. */ static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { lua_Integer res; if (!lua_toboolean(L, fr)) { /* false value? 
*/ lua_settop(L, fr - 1); /* remove results */ return -1; /* and fail */ } else if (lua_isboolean(L, fr)) /* true? */ res = curr; /* keep current position */ else { res = lua_tointeger(L, fr) - 1; /* new position */ if (res < curr || res > limit) luaL_error(L, "invalid position returned by match-time capture"); } lua_remove(L, fr); /* remove first result (offset) */ return res; } /* ** Add capture values returned by a dynamic capture to the list ** 'capture', nested inside a group. 'fd' indexes the first capture ** value, 'n' is the number of values (at least 1). The open group ** capture is already in 'capture', before the place for the new entries. */ static void adddyncaptures (Index_t index, Capture *capture, int n, int fd) { int i; assert(capture[-1].kind == Cgroup && capture[-1].siz == 0); capture[-1].idx = 0; /* make group capture an anonymous group */ for (i = 0; i < n; i++) { /* add runtime captures */ capture[i].kind = Cruntime; capture[i].siz = 1; /* mark it as closed */ capture[i].idx = fd + i; /* stack index of capture value */ capture[i].index = index; } capture[n].kind = Cclose; /* close group */ capture[n].siz = 1; capture[n].index = index; } /* ** Remove dynamic captures from the Lua stack (called in case of failure) */ static int removedyncap (lua_State *L, Capture *capture, int level, int last) { int id = finddyncap(capture + level, capture + last); /* index of 1st cap. */ int top = lua_gettop(L); if (id == 0) return 0; /* no dynamic captures? */ lua_settop(L, id - 1); /* remove captures */ return top - id + 1; /* number of values removed */ } /* ** Find the corresponding 'open' capture before 'cap', when that capture ** can become a full capture. If a full capture c1 is followed by an ** empty capture c2, there is no way to know whether c2 is inside ** c1. So, full captures can enclose only captures that start *before* ** its end. 
*/
/*
** 'cap' points to the first free entry of the capture list and
** 'currindex' is the current subject position. The search goes
** backwards from the entry just before 'cap', for at most MAXLOP
** entries. The result is the open capture to be closed, or NULL when
** the closing must be recorded as a separate entry.
*/
static Capture *findopen (Capture *cap, Index_t currindex) {
  int i;
  cap--;  /* check last capture */
  /* Must it be inside current one, but starts where current one ends? */
  if (!isopencap(cap) && cap->index == currindex)
    return NULL;  /* current one cannot be a full capture */
  /* else, look for an 'open' capture */
  for (i = 0; i < MAXLOP; i++, cap--) {
    if (currindex - cap->index >= UCHAR_MAX)
      return NULL;  /* capture too long for a full capture */
    else if (isopencap(cap))  /* open capture? */
      return cap;  /* that's the one to be closed */
    else if (cap->kind == Cclose)
      return NULL;  /* a full capture should not nest a non-full one */
  }
  return NULL;  /* not found within allowed search limit */
}


/*
** Opcode interpreter
**
** Runs the program starting at 'op' against the subject. 's' is the
** position where matching starts and 'e' bounds it (instructions that
** consume input test 's < e'); 'o' is the origin used to turn positions
** into indices ('s - o') and bounds look-behind. 'capture' is the
** initial capture array; it may be replaced by 'growcap' during the
** match. 'ptop' is the base used to compute the Lua-stack indices of
** the capture list and of the backtrack stack.
** Returns the subject position reached by IEnd, or NULL when the
** program reaches IGiveup (no match). May raise Lua errors through the
** helpers it calls.
** NOTE(review): IChar, ITestChar, ISet and ITestSet read '*s' before
** testing 's < e'; presumably the byte at 'e' is always readable (e.g.
** a terminating '\0') -- confirm with the caller.
*/
const char *match (lua_State *L, const char *o, const char *s, const char *e,
                   Instruction *op, Capture *capture, int ptop) {
  Stack stackbase[INITBACK];  /* initial stack lives in the C stack */
  Stack *stacklimit = stackbase + INITBACK;
  Stack *stack = stackbase;  /* point to first empty slot in stack */
  int capsize = INITCAPSIZE;
  int captop = 0;  /* point to first empty slot in captures */
  int ndyncap = 0;  /* number of dynamic captures (in Lua stack) */
  const Instruction *p = op;  /* current instruction */
  /* bottom entry: failing with nothing else to try runs 'giveup' */
  stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++;
  /* record the stack base in the Lua stack (read back by 'getstackbase') */
  lua_pushlightuserdata(L, stackbase);
  for (;;) {
#if defined(DEBUG)
      printf("-------------------------------------\n");
      printcaplist(capture, capture + captop);
      printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ",
             s, (int)(stack - getstackbase(L, ptop)), ndyncap, captop);
      printinst(op, p);
#endif
    /* dynamic capture values are the only items above the stack slot */
    assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop);
    switch ((Opcode)p->i.code) {
      case IEnd: {  /* successful match: close capture list */
        assert(stack == getstackbase(L, ptop) + 1);
        capture[captop].kind = Cclose;
        capture[captop].index = MAXINDT;
        return s;
      }
      case IGiveup: {  /* no alternatives left: match fails */
        assert(stack == getstackbase(L, ptop));
        return NULL;
      }
      case IRet: {  /* return from a call (top entry must be a call) */
        assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL);
        p = (--stack)->p;
        continue;
      }
      case IAny: {  /* consume one byte, if there is one */
        if (s < e) { p++; s++; }
        else goto fail;
        continue;
      }
      case IUTFR: {  /* one UTF-8 code point inside a range */
        int codepoint;
        if (s >= e)
          goto fail;
        s = utf8_decode (s, &codepoint);
        if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
          p += 2;
        else
          goto fail;
        continue;
      }
      case ITestAny: {  /* like IAny, but jumps instead of failing and
                           does not consume input */
        if (s < e) p += 2;
        else p += getoffset(p);
        continue;
      }
      case IChar: {  /* consume one specific byte */
        if ((byte)*s == p->i.aux1 && s < e) { p++; s++; }
        else goto fail;
        continue;
      }
      case ITestChar: {  /* test (without consuming) one specific byte */
        if ((byte)*s == p->i.aux1 && s < e) p += 2;
        else p += getoffset(p);
        continue;
      }
      case ISet: {  /* consume one byte belonging to a set; the set's
                       bit array follows the instruction */
        uint c = (byte)*s;
        if (charinset(p, (p+1)->buff, c) && s < e)
          { p += 1 + p->i.aux2.set.size; s++; }
        else goto fail;
        continue;
      }
      case ITestSet: {  /* test (without consuming) one byte against a
                           set; bit array follows the jump offset */
        uint c = (byte)*s;
        if (charinset(p, (p + 2)->buff, c) && s < e)
          p += 2 + p->i.aux2.set.size;
        else p += getoffset(p);
        continue;
      }
      case IBehind: {  /* move back 'n' bytes, if available after 'o' */
        int n = p->i.aux1;
        if (n > s - o) goto fail;
        s -= n; p++;
        continue;
      }
      case ISpan: {  /* consume bytes while they belong to the set */
        for (; s < e; s++) {
          uint c = (byte)*s;
          if (!charinset(p, (p+1)->buff, c)) break;
        }
        p += 1 + p->i.aux2.set.size;
        continue;
      }
      case IJmp: {
        p += getoffset(p);
        continue;
      }
      case IChoice: {  /* push a backtrack entry (alternative, position,
                          capture level) */
        if (stack == stacklimit)
          stack = doublestack(L, &stacklimit, ptop);
        stack->p = p + getoffset(p);
        stack->s = s;
        stack->caplevel = captop;
        stack++;
        p += 2;
        continue;
      }
      case ICall: {  /* push a call entry (marked by a NULL position) */
        if (stack == stacklimit)
          stack = doublestack(L, &stacklimit, ptop);
        stack->s = NULL;
        stack->p = p + 2;  /* save return address */
        stack++;
        p += getoffset(p);
        continue;
      }
      case ICommit: {  /* discard top backtrack entry and jump */
        assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
        stack--;
        p += getoffset(p);
        continue;
      }
      case IPartialCommit: {  /* update top backtrack entry in place */
        assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
        (stack - 1)->s = s;
        (stack - 1)->caplevel = captop;
        p += getoffset(p);
        continue;
      }
      case IBackCommit: {  /* restore position and captures from top
                              backtrack entry, pop it, and jump */
        assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
        s = (--stack)->s;
        if (ndyncap > 0)  /* are there matchtime captures? */
          ndyncap -= removedyncap(L, capture, stack->caplevel, captop);
        captop = stack->caplevel;
        p += getoffset(p);
        continue;
      }
      case IFailTwice:  /* discard top entry, then fail */
        assert(stack > getstackbase(L, ptop));
        stack--;
        /* FALLTHROUGH */
      case IFail:
      fail: { /* pattern failed: try to backtrack */
        do {  /* remove pending calls */
          assert(stack > getstackbase(L, ptop));
          s = (--stack)->s;
        } while (s == NULL);
        if (ndyncap > 0)  /* are there matchtime captures? */
          ndyncap -= removedyncap(L, capture, stack->caplevel, captop);
        captop = stack->caplevel;
        p = stack->p;
#if defined(DEBUG)
        printf("**FAIL**\n");
#endif
        continue;
      }
      case ICloseRunTime: {  /* run a match-time capture */
        CapState cs;
        int rem, res, n;
        int fr = lua_gettop(L) + 1;  /* stack index of first result */
        cs.reclevel = 0; cs.L = L;
        cs.s = o; cs.ocap = capture; cs.ptop = ptop;
        n = runtimecap(&cs, capture + captop, s, &rem);  /* call function */
        captop -= n;  /* remove nested captures */
        ndyncap -= rem;  /* update number of dynamic captures */
        fr -= rem;  /* 'rem' items were popped from Lua stack */
        res = resdyncaptures(L, fr, s - o, e - o);  /* get result */
        if (res == -1)  /* fail? */
          goto fail;
        s = o + res;  /* else update current position */
        n = lua_gettop(L) - fr + 1;  /* number of new captures */
        ndyncap += n;  /* update number of dynamic captures */
        if (n == 0)  /* no new captures? */
          captop--;  /* remove open group */
        else {  /* new captures; keep original open group */
          if (fr + n >= SHRT_MAX)
            luaL_error(L, "too many results in match-time capture");
          /* add new captures + close group to 'capture' list */
          capture = growcap(L, capture, &capsize, captop, n + 1, ptop);
          adddyncaptures(s - o, capture + captop, n, fr);
          captop += n + 1;  /* new captures + close group */
        }
        p++;
        continue;
      }
      case ICloseCapture: {  /* close a capture, merging it with its
                                'open' entry when 'findopen' allows */
        Capture *open = findopen(capture + captop, s - o);
        assert(captop > 0);
        if (open) {  /* if possible, turn capture into a full capture */
          open->siz = (s - o) - open->index + 1;
          p++;
          continue;
        }
        else {  /* must create a close capture */
          capture[captop].siz = 1;  /* mark entry as closed */
          capture[captop].index = s - o;
          goto pushcapture;
        }
      }
      case IOpenCapture:
        capture[captop].siz = 0;  /* mark entry as open */
        capture[captop].index = s - o;
        goto pushcapture;
      case IFullCapture:  /* capture of the 'getoff(p)' bytes just matched */
        capture[captop].siz = getoff(p) + 1;  /* save capture size */
        capture[captop].index = s - o - getoff(p);
        /* goto pushcapture; */
      pushcapture: {  /* complete the new entry and keep one slot free */
        capture[captop].idx = p->i.aux2.key;
        capture[captop].kind = getkind(p);
        captop++;
        capture = growcap(L, capture, &capsize, captop, 0, ptop);
        p++;
        continue;
      }
      default: assert(0); return NULL;
    }
  }
}

/* }====================================================== */


lpeg-1.1.0/re.html0000664000175000017500000003330014446336477013671 0ustar robertoroberto LPeg.re - Regex syntax for LPEG
LPeg.re
Regex syntax for LPEG

The re Module

The re module (provided by file re.lua in the distribution) supports a somewhat conventional regex syntax for pattern usage within LPeg.

The next table summarizes re's syntax. A p represents an arbitrary pattern; num represents a number ([0-9]+); name represents an identifier ([a-zA-Z][a-zA-Z0-9_]*). Constructions are listed in order of decreasing precedence.
Syntax Description
( p ) grouping
& p and predicate
! p not predicate
p1 p2 concatenation
p1 / p2 ordered choice
p ? optional match
p * zero or more repetitions
p + one or more repetitions
p^num exactly num repetitions
p^+num at least num repetitions
p^-num at most num repetitions
(name <- p)+ grammar
'string' literal string
"string" literal string
[class] character class
. any character
%name pattern defs[name] or a pre-defined pattern
name non terminal
<name> non terminal
{} position capture
{ p } simple capture
{: p :} anonymous group capture
{:name: p :} named group capture
{~ p ~} substitution capture
{| p |} table capture
=name back reference
p -> 'string' string capture
p -> "string" string capture
p -> num numbered capture
p -> name function/query/string capture equivalent to p / defs[name]
p => name match-time capture equivalent to lpeg.Cmt(p, defs[name])
p ~> name fold capture (deprecated)
p >> name accumulator capture equivalent to (p % defs[name])

Any space appearing in a syntax description can be replaced by zero or more space characters and Lua-style short comments (-- until end of line).

Character classes define sets of characters. An initial ^ complements the resulting set. A range x-y includes in the set all characters with codes between the codes of x and y. A pre-defined class %name includes all characters of that class. A simple character includes itself in the set. The only special characters inside a class are ^ (special only if it is the first character); ] (can be included in the set as the first character, after the optional ^); % (special only if followed by a letter); and - (can be included in the set as the first or the last character).

Currently the pre-defined classes are similar to those from Lua's string library (%a for letters, %A for non letters, etc.). There is also a class %nl containing only the newline character, which is particularly handy for grammars written inside long strings, as long strings do not interpret escape sequences like \n.

Functions

re.compile (string [, defs])

Compiles the given string and returns an equivalent LPeg pattern. The given string may define either an expression or a grammar. The optional defs table provides extra Lua values to be used by the pattern.

re.find (subject, pattern [, init])

Searches the given pattern in the given subject. If it finds a match, returns the index where this occurrence starts and the index where it ends. Otherwise, returns nil.

An optional numeric argument init makes the search start at that position in the subject string. As usual in Lua libraries, a negative value counts from the end.

re.gsub (subject, pattern, replacement)

Does a global substitution, replacing all occurrences of pattern in the given subject by replacement.

re.match (subject, pattern)

Matches the given pattern against the given subject, returning all captures.

re.updatelocale ()

Updates the pre-defined character classes to the current locale.

Some Examples

A complete simple program

The next code shows a simple complete Lua program using the re module:

local re = require"re"

-- find the position of the first numeral in a string
print(re.find("the number 423 is odd", "[0-9]+"))  --> 12    14

-- returns all words in a string
print(re.match("the number 423 is odd", "({%a+} / .)*"))
--> the    number    is    odd

-- returns the first numeral in a string
print(re.match("the number 423 is odd", "s <- {%d+} / . s"))
--> 423

-- substitutes a dot for each vowel in a string
print(re.gsub("hello World", "[aeiou]", "."))
--> h.ll. W.rld

Balanced parentheses

The following call will produce the same pattern produced by the Lua expression in the balanced parentheses example:

b = re.compile[[  balanced <- "(" ([^()] / balanced)* ")"  ]]

String reversal

The next example reverses a string:

rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']]
print(rev:match"0123456789")   --> 9876543210

CSV decoder

The next example replicates the CSV decoder:

record = re.compile[[
  record <- {| field (',' field)* |} (%nl / !.)
  field <- escaped / nonescaped
  nonescaped <- { [^,"%nl]* }
  escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"'
]]

Lua's long strings

The next example matches Lua long strings:

c = re.compile([[
  longstring <- ('[' {:eq: '='* :} '[' close)
  close <- ']' =eq ']' / . close
]])

print(c:match'[==[]]===]]]]==]===[]')   --> 17

Abstract Syntax Trees

This example shows a simple way to build an abstract syntax tree (AST) for a given grammar. To keep our example simple, let us consider the following grammar for lists of names:

p = re.compile[[
      listname <- (name s)*
      name <- [a-z][a-z]*
      s <- %s*
]]

Now, we will add captures to build a corresponding AST. As a first step, the pattern will build a table to represent each non terminal; terminals will be represented by their corresponding strings:

c = re.compile[[
      listname <- {| (name s)* |}
      name <- {| {[a-z][a-z]*} |}
      s <- %s*
]]

Now, a match against "hi hello bye" results in the table {{"hi"}, {"hello"}, {"bye"}}.

For such a simple grammar, this AST is more than enough; actually, the tables around each single name are already overkill. More complex grammars, however, may need some more structure. Specifically, it would be useful if each table had a tag field telling what non terminal that table represents. We can add such a tag using named group captures:

x = re.compile[[
      listname <- {| {:tag: '' -> 'list':} (name s)* |}
      name <- {| {:tag: '' -> 'id':} {[a-z][a-z]*} |}
      s <- ' '*
]]

With these group captures, a match against "hi hello bye" results in the following table:

{tag="list",
  {tag="id", "hi"},
  {tag="id", "hello"},
  {tag="id", "bye"}
}

Indented blocks

This example breaks indented blocks into tables, respecting the indentation:

p = re.compile[[
  block <- {| {:ident:' '*:} line
           ((=ident !' ' line) / &(=ident ' ') block)* |}
  line <- {[^%nl]*} %nl
]]

As an example, consider the following text:

t = p:match[[
first line
  subline 1
  subline 2
second line
third line
  subline 3.1
    subline 3.1.1
  subline 3.2
]]

The resulting table t will be like this:

   {'first line'; {'subline 1'; 'subline 2'; ident = '  '};
    'second line';
    'third line'; { 'subline 3.1'; {'subline 3.1.1'; ident = '    '};
                    'subline 3.2'; ident = '  '};
    ident = ''}

Macro expander

This example implements a simple macro expander. Macros must be defined as part of the pattern, following some simple rules:

p = re.compile[[
      text <- {~ item* ~}
      item <- macro / [^()] / '(' item* ')'
      arg <- ' '* {~ (!',' item)* ~}
      args <- '(' arg (',' arg)* ')'
      -- now we define some macros
      macro <- ('apply' args) -> '%1(%2)'
             / ('add' args) -> '%1 + %2'
             / ('mul' args) -> '%1 * %2'
]]

print(p:match"add(mul(a,b), apply(f,x))")   --> a * b + f(x)

A text is a sequence of items, wherein we apply a substitution capture to expand any macros. An item is either a macro, any character different from parentheses, or a parenthesized expression. A macro argument (arg) is a sequence of items different from a comma. (Note that a comma may appear inside an item, e.g., inside a parenthesized expression.) Again we do a substitution capture to expand any macro in the argument before expanding the outer macro. args is a list of arguments separated by commas. Finally we define the macros. Each macro is a string substitution; it replaces the macro name and its arguments by its corresponding string, with each %n replaced by the n-th argument.

Patterns

This example shows the complete syntax of patterns accepted by re.

p = [=[

pattern         <- exp !.
exp             <- S (grammar / alternative)

alternative     <- seq ('/' S seq)*
seq             <- prefix*
prefix          <- '&' S prefix / '!' S prefix / suffix
suffix          <- primary S (([+*?]
                            / '^' [+-]? num
                            / '->' S (string / '{}' / name)
                            / '>>' S name
                            / '=>' S name) S)*

primary         <- '(' exp ')' / string / class / defined
                 / '{:' (name ':')? exp ':}'
                 / '=' name
                 / '{}'
                 / '{~' exp '~}'
                 / '{|' exp '|}'
                 / '{' exp '}'
                 / '.'
                 / name S !arrow
                 / '<' name '>'          -- old-style non terminals

grammar         <- definition+
definition      <- name S arrow exp

class           <- '[' '^'? item (!']' item)* ']'
item            <- defined / range / .
range           <- . '-' [^]]

S               <- (%s / '--' [^%nl]*)*   -- spaces and comments
name            <- [A-Za-z_][A-Za-z0-9_]*
arrow           <- '<-'
num             <- [0-9]+
string          <- '"' [^"]* '"' / "'" [^']* "'"
defined         <- '%' name

]=]

print(re.match(p, p))   -- a self description must match itself

License

This module is part of the LPeg package and shares its license.

lpeg-1.1.0/README.md0000664000175000017500000000016514446336477013657 0ustar robertoroberto# LPeg - Parsing Expression Grammars For Lua For more information, see [Lpeg](//www.inf.puc-rio.br/~roberto/lpeg/). lpeg-1.1.0/lpcset.h0000664000175000017500000000134014446336477014037 0ustar robertoroberto #if !defined(lpset_h) #define lpset_h #include "lpcset.h" #include "lpcode.h" #include "lptree.h" /* ** Extra information for the result of 'charsettype'. When result is ** IChar, 'offset' is the character. When result is ISet, 'cs' is the ** supporting bit array (with offset included), 'offset' is the offset ** (in bytes), 'size' is the size (in bytes), and 'delt' is the default ** value for bytes outside the set. */ typedef struct { const byte *cs; int offset; int size; int deflt; } charsetinfo; int tocharset (TTree *tree, Charset *cs); Opcode charsettype (const byte *cs, charsetinfo *info); byte getbytefromcharset (const charsetinfo *info, int index); void tree2cset (TTree *tree, charsetinfo *info); #endif lpeg-1.1.0/lptree.h0000664000175000017500000000462314446336477014047 0ustar robertoroberto #if !defined(lptree_h) #define lptree_h #include "lptypes.h" /* ** types of trees */ typedef enum TTag { TChar = 0, /* 'n' = char */ TSet, /* the set is encoded in 'u.set' and the next 'u.set.size' bytes */ TAny, TTrue, TFalse, TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint; 'cap' has length; 'key' has first byte; extra info is similar for end codepoint */ TRep, /* 'sib1'* */ TSeq, /* 'sib1' 'sib2' */ TChoice, /* 'sib1' / 'sib2' */ TNot, /* !'sib1' */ TAnd, /* &'sib1' */ TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ TOpenCall, /* ktable[key] is rule's key */ TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; extra info 'n' is rule's sequential number */ TXInfo, /* extra info */ TGrammar, /* 'sib1' is initial (and first) rule */ TBehind, /* 'sib1' is pattern, 'n' is how 
much to go back */ TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); ktable[key] is Lua value associated with capture; 'sib1' is capture body */ TRunTime /* run-time capture: 'key' is Lua function; 'sib1' is capture body */ } TTag; /* ** Tree trees ** The first child of a tree (if there is one) is immediately after ** the tree. A reference to a second child (ps) is its position ** relative to the position of the tree itself. */ typedef struct TTree { byte tag; byte cap; /* kind of capture (if it is a capture) */ unsigned short key; /* key in ktable for Lua data (0 if no key) */ union { int ps; /* occasional second child */ int n; /* occasional counter */ struct { byte offset; /* compact set offset (in bytes) */ byte size; /* compact set size (in bytes) */ byte deflt; /* default value */ byte bitmap[1]; /* bitmap (open array) */ } set; /* for compact sets */ } u; } TTree; /* access to charset */ #define treebuffer(t) ((t)->u.set.bitmap) /* ** A complete pattern has its tree plus, if already compiled, ** its corresponding code */ typedef struct Pattern { union Instruction *code; TTree tree[1]; } Pattern; /* number of children for each tree */ extern const byte numsiblings[]; /* access to children */ #define sib1(t) ((t) + 1) #define sib2(t) ((t) + (t)->u.ps) #endif lpeg-1.1.0/lptree.c0000664000175000017500000011475614446336477014053 0ustar robertoroberto #include #include #include #include "lua.h" #include "lauxlib.h" #include "lptypes.h" #include "lpcap.h" #include "lpcode.h" #include "lpprint.h" #include "lptree.h" #include "lpcset.h" /* number of siblings for each tree */ const byte numsiblings[] = { 0, 0, 0, /* char, set, any */ 0, 0, 0, /* true, false, utf-range */ 1, /* acc */ 2, 2, /* seq, choice */ 1, 1, /* not, and */ 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */ 1, /* behind */ 1, 1 /* capture, runtime capture */ }; static TTree *newgrammar (lua_State *L, int arg); /* ** returns a reasonable name for value at index 'idx' on the 
stack */ static const char *val2str (lua_State *L, int idx) { const char *k = lua_tostring(L, idx); if (k != NULL) return lua_pushfstring(L, "%s", k); else return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx)); } /* ** Fix a TOpenCall into a TCall node, using table 'postable' to ** translate a key to its rule address in the tree. Raises an ** error if key does not exist. */ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) { int n; lua_rawgeti(L, -1, t->key); /* get rule's name */ lua_gettable(L, postable); /* query name in position table */ n = lua_tonumber(L, -1); /* get (absolute) position */ lua_pop(L, 1); /* remove position */ if (n == 0) { /* no position? */ lua_rawgeti(L, -1, t->key); /* get rule's name again */ luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); } t->tag = TCall; t->u.ps = n - (t - g); /* position relative to node */ assert(sib2(t)->tag == TRule); sib2(t)->key = t->key; /* fix rule's key */ } /* ** Transform left associative constructions into right ** associative ones, for sequence and choice; that is: ** (t11 + t12) + t2 => t11 + (t12 + t2) ** (t11 * t12) * t2 => t11 * (t12 * t2) ** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) */ static void correctassociativity (TTree *tree) { TTree *t1 = sib1(tree); assert(tree->tag == TChoice || tree->tag == TSeq); while (t1->tag == tree->tag) { int n1size = tree->u.ps - 1; /* t1 == Op t11 t12 */ int n11size = t1->u.ps - 1; int n12size = n1size - n11size - 1; memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */ tree->u.ps = n11size + 1; sib2(tree)->tag = tree->tag; sib2(tree)->u.ps = n12size + 1; } } /* ** Make final adjustments in a tree. Fix open calls in tree 't', ** making them refer to their respective rules or raising appropriate ** errors (if not inside a grammar). Correct associativity of associative ** constructions (making them right associative). 
Assume that tree's ** ktable is at the top of the stack (for error messages). */ static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { tailcall: switch (t->tag) { case TGrammar: /* subgrammars were already fixed */ return; case TOpenCall: { if (g != NULL) /* inside a grammar? */ fixonecall(L, postable, g, t); else { /* open call outside grammar */ lua_rawgeti(L, -1, t->key); luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); } break; } case TSeq: case TChoice: correctassociativity(t); break; } switch (numsiblings[t->tag]) { case 1: /* finalfix(L, postable, g, sib1(t)); */ t = sib1(t); goto tailcall; case 2: finalfix(L, postable, g, sib1(t)); t = sib2(t); goto tailcall; /* finalfix(L, postable, g, sib2(t)); */ default: assert(numsiblings[t->tag] == 0); break; } } /* ** {=================================================================== ** KTable manipulation ** ** - The ktable of a pattern 'p' can be shared by other patterns that ** contain 'p' and no other constants. Because of this sharing, we ** should not add elements to a 'ktable' unless it was freshly created ** for the new pattern. ** ** - The maximum index in a ktable is USHRT_MAX, because trees and ** patterns use unsigned shorts to store those indices. ** ==================================================================== */ /* ** Create a new 'ktable' to the pattern at the top of the stack. */ static void newktable (lua_State *L, int n) { lua_createtable(L, n, 0); /* create a fresh table */ lua_setuservalue(L, -2); /* set it as 'ktable' for pattern */ } /* ** Add element 'idx' to 'ktable' of pattern at the top of the stack; ** Return index of new element. ** If new element is nil, does not add it to table (as it would be ** useless) and returns 0, as ktable[0] is always nil. */ static int addtoktable (lua_State *L, int idx) { if (lua_isnil(L, idx)) /* nil value? 
*/ return 0; else { int n; lua_getuservalue(L, -1); /* get ktable from pattern */ n = lua_rawlen(L, -1); if (n >= USHRT_MAX) luaL_error(L, "too many Lua values in pattern"); lua_pushvalue(L, idx); /* element to be added */ lua_rawseti(L, -2, ++n); lua_pop(L, 1); /* remove 'ktable' */ return n; } } /* ** Return the number of elements in the ktable at 'idx'. ** In Lua 5.2/5.3, default "environment" for patterns is nil, not ** a table. Treat it as an empty table. In Lua 5.1, assumes that ** the environment has no numeric indices (len == 0) */ static int ktablelen (lua_State *L, int idx) { if (!lua_istable(L, idx)) return 0; else return lua_rawlen(L, idx); } /* ** Concatentate the contents of table 'idx1' into table 'idx2'. ** (Assume that both indices are negative.) ** Return the original length of table 'idx2' (or 0, if no ** element was added, as there is no need to correct any index). */ static int concattable (lua_State *L, int idx1, int idx2) { int i; int n1 = ktablelen(L, idx1); int n2 = ktablelen(L, idx2); if (n1 + n2 > USHRT_MAX) luaL_error(L, "too many Lua values in pattern"); if (n1 == 0) return 0; /* nothing to correct */ for (i = 1; i <= n1; i++) { lua_rawgeti(L, idx1, i); lua_rawseti(L, idx2 - 1, n2 + i); /* correct 'idx2' */ } return n2; } /* ** When joining 'ktables', constants from one of the subpatterns must ** be renumbered; 'correctkeys' corrects their indices (adding 'n' ** to each of them) */ static void correctkeys (TTree *tree, int n) { if (n == 0) return; /* no correction? 
*/ tailcall: switch (tree->tag) { case TOpenCall: case TCall: case TRunTime: case TRule: { if (tree->key > 0) tree->key += n; break; } case TCapture: { if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum) tree->key += n; break; } default: break; } switch (numsiblings[tree->tag]) { case 1: /* correctkeys(sib1(tree), n); */ tree = sib1(tree); goto tailcall; case 2: correctkeys(sib1(tree), n); tree = sib2(tree); goto tailcall; /* correctkeys(sib2(tree), n); */ default: assert(numsiblings[tree->tag] == 0); break; } } /* ** Join the ktables from p1 and p2 the ktable for the new pattern at the ** top of the stack, reusing them when possible. */ static void joinktables (lua_State *L, int p1, TTree *t2, int p2) { int n1, n2; lua_getuservalue(L, p1); /* get ktables */ lua_getuservalue(L, p2); n1 = ktablelen(L, -2); n2 = ktablelen(L, -1); if (n1 == 0 && n2 == 0) /* are both tables empty? */ lua_pop(L, 2); /* nothing to be done; pop tables */ else if (n2 == 0 || lp_equal(L, -2, -1)) { /* 2nd table empty or equal? */ lua_pop(L, 1); /* pop 2nd table */ lua_setuservalue(L, -2); /* set 1st ktable into new pattern */ } else if (n1 == 0) { /* first table is empty? */ lua_setuservalue(L, -3); /* set 2nd table into new pattern */ lua_pop(L, 1); /* pop 1st table */ } else { lua_createtable(L, n1 + n2, 0); /* create ktable for new pattern */ /* stack: new p; ktable p1; ktable p2; new ktable */ concattable(L, -3, -1); /* from p1 into new ktable */ concattable(L, -2, -1); /* from p2 into new ktable */ lua_setuservalue(L, -4); /* new ktable becomes 'p' environment */ lua_pop(L, 2); /* pop other ktables */ correctkeys(t2, n1); /* correction for indices from p2 */ } } /* ** copy 'ktable' of element 'idx' to new tree (on top of stack) */ static void copyktable (lua_State *L, int idx) { lua_getuservalue(L, idx); lua_setuservalue(L, -2); } /* ** merge 'ktable' from 'stree' at stack index 'idx' into 'ktable' ** from tree at the top of the stack, and correct corresponding ** tree. 
*/ static void mergektable (lua_State *L, int idx, TTree *stree) { int n; lua_getuservalue(L, -1); /* get ktables */ lua_getuservalue(L, idx); n = concattable(L, -1, -2); lua_pop(L, 2); /* remove both ktables */ correctkeys(stree, n); } /* ** Create a new 'ktable' to the pattern at the top of the stack, adding ** all elements from pattern 'p' (if not 0) plus element 'idx' to it. ** Return index of new element. */ static int addtonewktable (lua_State *L, int p, int idx) { newktable(L, 1); if (p) mergektable(L, p, NULL); return addtoktable(L, idx); } /* }====================================================== */ /* ** {====================================================== ** Tree generation ** ======================================================= */ /* ** In 5.2, could use 'luaL_testudata'... */ static int testpattern (lua_State *L, int idx) { if (lua_touserdata(L, idx)) { /* value is a userdata? */ if (lua_getmetatable(L, idx)) { /* does it have a metatable? */ luaL_getmetatable(L, PATTERN_T); if (lua_rawequal(L, -1, -2)) { /* does it have the correct mt? */ lua_pop(L, 2); /* remove both metatables */ return 1; } } } return 0; } static Pattern *getpattern (lua_State *L, int idx) { return (Pattern *)luaL_checkudata(L, idx, PATTERN_T); } static int getsize (lua_State *L, int idx) { return (lua_rawlen(L, idx) - offsetof(Pattern, tree)) / sizeof(TTree); } static TTree *gettree (lua_State *L, int idx, int *len) { Pattern *p = getpattern(L, idx); if (len) *len = getsize(L, idx); return p->tree; } /* ** create a pattern followed by a tree with 'len' nodes. Set its ** uservalue (the 'ktable') equal to its metatable. (It could be any ** empty sequence; the metatable is at hand here, so we use it.) 
*/ static TTree *newtree (lua_State *L, int len) { size_t size = offsetof(Pattern, tree) + len * sizeof(TTree); Pattern *p = (Pattern *)lua_newuserdata(L, size); luaL_getmetatable(L, PATTERN_T); lua_pushvalue(L, -1); lua_setuservalue(L, -3); lua_setmetatable(L, -2); p->code = NULL; return p->tree; } static TTree *newleaf (lua_State *L, int tag) { TTree *tree = newtree(L, 1); tree->tag = tag; return tree; } /* ** Create a tree for a charset, optimizing for special cases: empty set, ** full set, and singleton set. */ static TTree *newcharset (lua_State *L, byte *cs) { charsetinfo info; Opcode op = charsettype(cs, &info); switch (op) { case IFail: return newleaf(L, TFalse); /* empty set */ case IAny: return newleaf(L, TAny); /* full set */ case IChar: { /* singleton set */ TTree *tree =newleaf(L, TChar); tree->u.n = info.offset; return tree; } default: { /* regular set */ int i; int bsize = /* tree size in bytes */ (int)offsetof(TTree, u.set.bitmap) + info.size; TTree *tree = newtree(L, bytes2slots(bsize)); assert(op == ISet); tree->tag = TSet; tree->u.set.offset = info.offset; tree->u.set.size = info.size; tree->u.set.deflt = info.deflt; for (i = 0; i < info.size; i++) { assert(&treebuffer(tree)[i] < (byte*)tree + bsize); treebuffer(tree)[i] = cs[info.offset + i]; } return tree; } } } /* ** Add to tree a sequence where first sibling is 'sib' (with size ** 'sibsize'); return position for second sibling. */ static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) { tree->tag = TSeq; tree->u.ps = sibsize + 1; memcpy(sib1(tree), sib, sibsize * sizeof(TTree)); return sib2(tree); } /* ** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got ** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it ** must build a sequence of sequence of sequence...) */ static void fillseq (TTree *tree, int tag, int n, const char *s) { int i; for (i = 0; i < n - 1; i++) { /* initial n-1 copies of Seq tag; Seq ... 
*/ tree->tag = TSeq; tree->u.ps = 2; sib1(tree)->tag = tag; sib1(tree)->u.n = s ? (byte)s[i] : 0; tree = sib2(tree); } tree->tag = tag; /* last one does not need TSeq */ tree->u.n = s ? (byte)s[i] : 0; } /* ** Numbers as patterns: ** 0 == true (always match); n == TAny repeated 'n' times; ** -n == not (TAny repeated 'n' times) */ static TTree *numtree (lua_State *L, int n) { if (n == 0) return newleaf(L, TTrue); else { TTree *tree, *nd; if (n > 0) tree = nd = newtree(L, 2 * n - 1); else { /* negative: code it as !(-n) */ n = -n; tree = newtree(L, 2 * n); tree->tag = TNot; nd = sib1(tree); } fillseq(nd, TAny, n, NULL); /* sequence of 'n' any's */ return tree; } } /* ** Convert value at index 'idx' to a pattern */ static TTree *getpatt (lua_State *L, int idx, int *len) { TTree *tree; switch (lua_type(L, idx)) { case LUA_TSTRING: { size_t slen; const char *s = lua_tolstring(L, idx, &slen); /* get string */ if (slen == 0) /* empty? */ tree = newleaf(L, TTrue); /* always match */ else { tree = newtree(L, 2 * (slen - 1) + 1); fillseq(tree, TChar, slen, s); /* sequence of 'slen' chars */ } break; } case LUA_TNUMBER: { int n = lua_tointeger(L, idx); tree = numtree(L, n); break; } case LUA_TBOOLEAN: { tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse)); break; } case LUA_TTABLE: { tree = newgrammar(L, idx); break; } case LUA_TFUNCTION: { tree = newtree(L, 2); tree->tag = TRunTime; tree->key = addtonewktable(L, 0, idx); sib1(tree)->tag = TTrue; break; } default: { return gettree(L, idx, len); } } lua_replace(L, idx); /* put new tree into 'idx' slot */ if (len) *len = getsize(L, idx); return tree; } /* ** create a new tree, whith a new root and one sibling. ** Sibling must be on the Lua stack, at index 1. 
*/ static TTree *newroot1sib (lua_State *L, int tag) { int s1; TTree *tree1 = getpatt(L, 1, &s1); TTree *tree = newtree(L, 1 + s1); /* create new tree */ tree->tag = tag; memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); copyktable(L, 1); return tree; } /* ** create a new tree, whith a new root and 2 siblings. ** Siblings must be on the Lua stack, first one at index 1. */ static TTree *newroot2sib (lua_State *L, int tag) { int s1, s2; TTree *tree1 = getpatt(L, 1, &s1); TTree *tree2 = getpatt(L, 2, &s2); TTree *tree = newtree(L, 1 + s1 + s2); /* create new tree */ tree->tag = tag; tree->u.ps = 1 + s1; memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); memcpy(sib2(tree), tree2, s2 * sizeof(TTree)); joinktables(L, 1, sib2(tree), 2); return tree; } static int lp_P (lua_State *L) { luaL_checkany(L, 1); getpatt(L, 1, NULL); lua_settop(L, 1); return 1; } /* ** sequence operator; optimizations: ** false x => false, x true => x, true x => x ** (cannot do x . false => false because x may have runtime captures) */ static int lp_seq (lua_State *L) { TTree *tree1 = getpatt(L, 1, NULL); TTree *tree2 = getpatt(L, 2, NULL); if (tree1->tag == TFalse || tree2->tag == TTrue) lua_pushvalue(L, 1); /* false . x == false, x . true = x */ else if (tree1->tag == TTrue) lua_pushvalue(L, 2); /* true . 
x = x */ else newroot2sib(L, TSeq); return 1; } /* ** choice operator; optimizations: ** charset / charset => charset ** true / x => true, x / false => x, false / x => x ** (x / true is not equivalent to true) */ static int lp_choice (lua_State *L) { Charset st1, st2; TTree *t1 = getpatt(L, 1, NULL); TTree *t2 = getpatt(L, 2, NULL); if (tocharset(t1, &st1) && tocharset(t2, &st2)) { loopset(i, st1.cs[i] |= st2.cs[i]); newcharset(L, st1.cs); } else if (nofail(t1) || t2->tag == TFalse) lua_pushvalue(L, 1); /* true / x => true, x / false => x */ else if (t1->tag == TFalse) lua_pushvalue(L, 2); /* false / x => x */ else newroot2sib(L, TChoice); return 1; } /* ** p^n */ static int lp_star (lua_State *L) { int size1; int n = (int)luaL_checkinteger(L, 2); TTree *tree1 = getpatt(L, 1, &size1); if (n >= 0) { /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */ TTree *tree = newtree(L, (n + 1) * (size1 + 1)); if (nullable(tree1)) luaL_error(L, "loop body may accept empty string"); while (n--) /* repeat 'n' times */ tree = seqaux(tree, tree1, size1); tree->tag = TRep; memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); } else { /* choice (seq tree1 ... choice tree1 true ...) true */ TTree *tree; n = -n; /* size = (choice + seq + tree1 + true) * n, but the last has no seq */ tree = newtree(L, n * (size1 + 3) - 1); for (; n > 1; n--) { /* repeat (n - 1) times */ tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2; sib2(tree)->tag = TTrue; tree = sib1(tree); tree = seqaux(tree, tree1, size1); } tree->tag = TChoice; tree->u.ps = size1 + 1; sib2(tree)->tag = TTrue; memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); } copyktable(L, 1); return 1; } /* ** #p == &p */ static int lp_and (lua_State *L) { newroot1sib(L, TAnd); return 1; } /* ** -p == !p */ static int lp_not (lua_State *L) { newroot1sib(L, TNot); return 1; } /* ** [t1 - t2] == Seq (Not t2) t1 ** If t1 and t2 are charsets, make their difference. 
*/ static int lp_sub (lua_State *L) { Charset st1, st2; int s1, s2; TTree *t1 = getpatt(L, 1, &s1); TTree *t2 = getpatt(L, 2, &s2); if (tocharset(t1, &st1) && tocharset(t2, &st2)) { loopset(i, st1.cs[i] &= ~st2.cs[i]); newcharset(L, st1.cs); } else { TTree *tree = newtree(L, 2 + s1 + s2); tree->tag = TSeq; /* sequence of... */ tree->u.ps = 2 + s2; sib1(tree)->tag = TNot; /* ...not... */ memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree)); /* ...t2 */ memcpy(sib2(tree), t1, s1 * sizeof(TTree)); /* ... and t1 */ joinktables(L, 1, sib1(tree), 2); } return 1; } static int lp_set (lua_State *L) { size_t l; const char *s = luaL_checklstring(L, 1, &l); byte buff[CHARSETSIZE]; clearset(buff); while (l--) { setchar(buff, (byte)(*s)); s++; } newcharset(L, buff); return 1; } static int lp_range (lua_State *L) { int arg; int top = lua_gettop(L); byte buff[CHARSETSIZE]; clearset(buff); for (arg = 1; arg <= top; arg++) { int c; size_t l; const char *r = luaL_checklstring(L, arg, &l); luaL_argcheck(L, l == 2, arg, "range must have two characters"); for (c = (byte)r[0]; c <= (byte)r[1]; c++) setchar(buff, c); } newcharset(L, buff); return 1; } /* ** Fills a tree node with basic information about the UTF-8 code point ** 'cpu': its value in 'n', its length in 'cap', and its first byte in ** 'key' */ static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { int len, fb, cp; cp = (int)cpu; if (cp <= 0x7f) { /* one byte? */ len = 1; fb = cp; } else if (cp <= 0x7ff) { len = 2; fb = 0xC0 | (cp >> 6); } else if (cp <= 0xffff) { len = 3; fb = 0xE0 | (cp >> 12); } else { luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); len = 4; fb = 0xF0 | (cp >> 18); } t->u.n = cp; t->cap = len; t->key = fb; } static int lp_utfr (lua_State *L) { lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); luaL_argcheck(L, from <= to, 2, "empty range"); if (to <= 0x7f) { /* ascii range? 
*/ uint f; byte buff[CHARSETSIZE]; /* code it as a regular charset */ clearset(buff); for (f = (int)from; f <= to; f++) setchar(buff, f); newcharset(L, buff); } else { /* multi-byte utf-8 range */ TTree *tree = newtree(L, 2); tree->tag = TUTFR; codeutftree(L, tree, from, 1); sib1(tree)->tag = TXInfo; codeutftree(L, sib1(tree), to, 2); } return 1; } /* ** Look-behind predicate */ static int lp_behind (lua_State *L) { TTree *tree; TTree *tree1 = getpatt(L, 1, NULL); int n = fixedlen(tree1); luaL_argcheck(L, n >= 0, 1, "pattern may not have fixed length"); luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures"); luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind"); tree = newroot1sib(L, TBehind); tree->u.n = n; return 1; } /* ** Create a non-terminal */ static int lp_V (lua_State *L) { TTree *tree = newleaf(L, TOpenCall); luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected"); tree->key = addtonewktable(L, 0, 1); return 1; } /* ** Create a tree for a non-empty capture, with a body and ** optionally with an associated Lua value (at index 'labelidx' in the ** stack) */ static int capture_aux (lua_State *L, int cap, int labelidx) { TTree *tree = newroot1sib(L, TCapture); tree->cap = cap; tree->key = (labelidx == 0) ? 0 : addtonewktable(L, 1, labelidx); return 1; } /* ** Fill a tree with an empty capture, using an empty (TTrue) sibling. ** (The 'key' field must be filled by the caller to finish the tree.) */ static TTree *auxemptycap (TTree *tree, int cap) { tree->tag = TCapture; tree->cap = cap; sib1(tree)->tag = TTrue; return tree; } /* ** Create a tree for an empty capture. */ static TTree *newemptycap (lua_State *L, int cap, int key) { TTree *tree = auxemptycap(newtree(L, 2), cap); tree->key = key; return tree; } /* ** Create a tree for an empty capture with an associated Lua value. 
*/ static TTree *newemptycapkey (lua_State *L, int cap, int idx) { TTree *tree = auxemptycap(newtree(L, 2), cap); tree->key = addtonewktable(L, 0, idx); return tree; } /* ** Captures with syntax p / v ** (function capture, query capture, string capture, or number capture) */ static int lp_divcapture (lua_State *L) { switch (lua_type(L, 2)) { case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2); case LUA_TTABLE: return capture_aux(L, Cquery, 2); case LUA_TSTRING: return capture_aux(L, Cstring, 2); case LUA_TNUMBER: { int n = lua_tointeger(L, 2); TTree *tree = newroot1sib(L, TCapture); luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 1, "invalid number"); tree->cap = Cnum; tree->key = n; return 1; } default: return luaL_error(L, "unexpected %s as 2nd operand to LPeg '/'", luaL_typename(L, 2)); } } static int lp_acccapture (lua_State *L) { return capture_aux(L, Cacc, 2); } static int lp_substcapture (lua_State *L) { return capture_aux(L, Csubst, 0); } static int lp_tablecapture (lua_State *L) { return capture_aux(L, Ctable, 0); } static int lp_groupcapture (lua_State *L) { if (lua_isnoneornil(L, 2)) return capture_aux(L, Cgroup, 0); else return capture_aux(L, Cgroup, 2); } static int lp_foldcapture (lua_State *L) { luaL_checktype(L, 2, LUA_TFUNCTION); return capture_aux(L, Cfold, 2); } static int lp_simplecapture (lua_State *L) { return capture_aux(L, Csimple, 0); } static int lp_poscapture (lua_State *L) { newemptycap(L, Cposition, 0); return 1; } static int lp_argcapture (lua_State *L) { int n = (int)luaL_checkinteger(L, 1); luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index"); newemptycap(L, Carg, n); return 1; } static int lp_backref (lua_State *L) { luaL_checkany(L, 1); newemptycapkey(L, Cbackref, 1); return 1; } /* ** Constant capture */ static int lp_constcapture (lua_State *L) { int i; int n = lua_gettop(L); /* number of values */ if (n == 0) /* no values? 
*/ newleaf(L, TTrue); /* no capture */ else if (n == 1) newemptycapkey(L, Cconst, 1); /* single constant capture */ else { /* create a group capture with all values */ TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2); newktable(L, n); /* create a 'ktable' for new tree */ tree->tag = TCapture; tree->cap = Cgroup; tree->key = 0; tree = sib1(tree); for (i = 1; i <= n - 1; i++) { tree->tag = TSeq; tree->u.ps = 3; /* skip TCapture and its sibling */ auxemptycap(sib1(tree), Cconst); sib1(tree)->key = addtoktable(L, i); tree = sib2(tree); } auxemptycap(tree, Cconst); tree->key = addtoktable(L, i); } return 1; } static int lp_matchtime (lua_State *L) { TTree *tree; luaL_checktype(L, 2, LUA_TFUNCTION); tree = newroot1sib(L, TRunTime); tree->key = addtonewktable(L, 1, 2); return 1; } /* }====================================================== */ /* ** {====================================================== ** Grammar - Tree generation ** ======================================================= */ /* ** push on the stack the index and the pattern for the ** initial rule of grammar at index 'arg' in the stack; ** also add that index into position table. */ static void getfirstrule (lua_State *L, int arg, int postab) { lua_rawgeti(L, arg, 1); /* access first element */ if (lua_isstring(L, -1)) { /* is it the name of initial rule? */ lua_pushvalue(L, -1); /* duplicate it to use as key */ lua_gettable(L, arg); /* get associated rule */ } else { lua_pushinteger(L, 1); /* key for initial rule */ lua_insert(L, -2); /* put it before rule */ } if (!testpattern(L, -1)) { /* initial rule not a pattern? 
*/ if (lua_isnil(L, -1)) luaL_error(L, "grammar has no initial rule"); else luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2)); } lua_pushvalue(L, -2); /* push key */ lua_pushinteger(L, 1); /* push rule position (after TGrammar) */ lua_settable(L, postab); /* insert pair at position table */ } /* ** traverse grammar at index 'arg', pushing all its keys and patterns ** into the stack. Create a new table (before all pairs key-pattern) to ** collect all keys and their associated positions in the final tree ** (the "position table"). ** Return the number of rules and (in 'totalsize') the total size ** for the new tree. */ static int collectrules (lua_State *L, int arg, int *totalsize) { int n = 1; /* to count number of rules */ int postab = lua_gettop(L) + 1; /* index of position table */ int size; /* accumulator for total size */ lua_newtable(L); /* create position table */ getfirstrule(L, arg, postab); size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */ lua_pushnil(L); /* prepare to traverse grammar table */ while (lua_next(L, arg) != 0) { if (lua_tonumber(L, -2) == 1 || lp_equal(L, -2, postab + 1)) { /* initial rule? */ lua_pop(L, 1); /* remove value (keep key for lua_next) */ continue; } if (!testpattern(L, -1)) /* value is not a pattern? 
*/ luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2)); luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules"); lua_pushvalue(L, -2); /* push key (to insert into position table) */ lua_pushinteger(L, size); lua_settable(L, postab); size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */ lua_pushvalue(L, -2); /* push key (for next lua_next) */ n++; } *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */ return n; } static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { int i; TTree *nd = sib1(grammar); /* auxiliary pointer to traverse the tree */ for (i = 0; i < n; i++) { /* add each rule into new tree */ int ridx = frule + 2*i + 1; /* index of i-th rule */ int rulesize; TTree *rn = gettree(L, ridx, &rulesize); TTree *pr = sib1(nd); /* points to rule's prerule */ nd->tag = TRule; nd->key = 0; /* will be fixed when rule is used */ pr->tag = TXInfo; pr->u.n = i; /* rule number */ nd->u.ps = rulesize + 2; /* point to next rule */ memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */ mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ nd = sib2(nd); /* move to next rule */ } nd->tag = TTrue; /* finish list of rules */ } /* ** Check whether a tree has potential infinite loops */ static int checkloops (TTree *tree) { tailcall: if (tree->tag == TRep && nullable(sib1(tree))) return 1; else if (tree->tag == TGrammar) return 0; /* sub-grammars already checked */ else { switch (numsiblings[tree->tag]) { case 1: /* return checkloops(sib1(tree)); */ tree = sib1(tree); goto tailcall; case 2: if (checkloops(sib1(tree))) return 1; /* else return checkloops(sib2(tree)); */ tree = sib2(tree); goto tailcall; default: assert(numsiblings[tree->tag] == 0); return 0; } } } /* ** Give appropriate error message for 'verifyrule'. If a rule appears ** twice in 'passed', there is path from it back to itself without ** advancing the subject. 
*/
static int verifyerror (lua_State *L, unsigned short *passed, int npassed) {
  int i, j;
  /* compare each entry of the path, from the last one, with all
     entries that come before it */
  for (i = npassed - 1; i >= 0; i--) {  /* search for a repetition */
    for (j = i - 1; j >= 0; j--) {
      if (passed[i] == passed[j]) {
        lua_rawgeti(L, -1, passed[i]);  /* get rule's key */
        return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1));
      }
    }
  }
  /* no key appears twice in the path */
  return luaL_error(L, "too many left calls in grammar");
}


/*
** Check whether a rule can be left recursive; raise an error in that
** case; otherwise return 1 iff pattern is nullable.
** The return value is used to check sequences, where the second pattern
** is only relevant if the first is nullable.
** Parameter 'nb' works as an accumulator, to allow tail calls in
** choices. ('nb' true makes the function return true.)
** Parameter 'passed' is a list of already visited rules, 'npassed'
** counts the elements in 'passed'.
** Assume ktable at the top of the stack.
*/
static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed,
                       int npassed, int nb) {
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny:
    case TFalse: case TUTFR:
      return nb;  /* cannot pass from here */
    case TTrue:
    case TBehind:  /* look-behind cannot have calls */
      return 1;
    case TNot: case TAnd: case TRep:
      /* return verifyrule(L, sib1(tree), passed, npassed, 1); */
      tree = sib1(tree); nb = 1; goto tailcall;
    case TCapture: case TRunTime: case TXInfo:
      /* return verifyrule(L, sib1(tree), passed, npassed, nb); */
      tree = sib1(tree); goto tailcall;
    case TCall:
      /* return verifyrule(L, sib2(tree), passed, npassed, nb); */
      tree = sib2(tree); goto tailcall;
    case TSeq:  /* only check 2nd child if first is nb */
      if (!verifyrule(L, sib1(tree), passed, npassed, 0))
        return nb;
      /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */
      tree = sib2(tree); goto tailcall;
    case TChoice:  /* must check both children */
      /* result of the first alternative is accumulated into 'nb' */
      nb = verifyrule(L, sib1(tree), passed, npassed, nb);
      /* return verifyrule(L, sib2(tree), passed, npassed, nb); */
      tree = sib2(tree); goto tailcall;
    case TRule:
      if (npassed >= MAXRULES)  /* too many steps? */
        return verifyerror(L, passed, npassed);  /* error */
      else {
        passed[npassed++] = tree->key;  /* add rule to path */
        /* return verifyrule(L, sib1(tree), passed, npassed, nb); */
        tree = sib1(tree); goto tailcall;
      }
    case TGrammar:
      return nullable(tree);  /* sub-grammar cannot be left recursive */
    default: assert(0); return 0;
  }
}


/*
** Verify all used rules of a grammar (rules with key 0 are skipped as
** unused): first look for left recursion with 'verifyrule', then look
** for empty loops with 'checkloops'. Errors are raised through
** 'luaL_error'. The rule's name for the "empty loop" message is read
** with 'lua_rawgeti' from the value at the top of the stack.
*/
static void verifygrammar (lua_State *L, TTree *grammar) {
  unsigned short passed[MAXRULES];  /* path buffer filled by 'verifyrule' */
  TTree *rule;
  /* check left-recursive rules */
  for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
    if (rule->key == 0) continue;  /* unused rule */
    verifyrule(L, sib1(rule), passed, 0, 0);
  }
  assert(rule->tag == TTrue);
  /* check infinite loops inside rules */
  for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
    if (rule->key == 0) continue;  /* unused rule */
    if (checkloops(sib1(rule))) {
      lua_rawgeti(L, -1, rule->key);  /* get rule's key */
      luaL_error(L, "empty loop in rule '%s'", val2str(L, -1));
    }
  }
  assert(rule->tag == TTrue);
}


/* ** Give a name for the initial rule if it is not referenced */ static void initialrulename (lua_State *L, TTree *grammar, int frule) { if (sib1(grammar)->key == 0) { /* initial rule is not referenced? 
*/ int n = lua_rawlen(L, -1) + 1; /* index for name */ lua_pushvalue(L, frule); /* rule's name */ lua_rawseti(L, -2, n); /* ktable was on the top of the stack */ sib1(grammar)->key = n; } }


/*
** Create a grammar tree from the value at stack index 'arg'.
** Collects the rules with 'collectrules', allocates a tree of the
** computed size, gives it a fresh ktable, builds the rule list, fixes
** it with 'finalfix', names the initial rule if needed, and verifies
** the grammar. Raises an argument error when there are more than
** MAXRULES rules. Returns the new tree; the auxiliary stack entries
** (position table and rule pairs) are removed before returning.
*/
static TTree *newgrammar (lua_State *L, int arg) {
  int treesize;
  int frule = lua_gettop(L) + 2;  /* position of first rule's key */
  int n = collectrules(L, arg, &treesize);
  TTree *g = newtree(L, treesize);
  luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules");
  g->tag = TGrammar;  g->u.n = n;
  lua_newtable(L);  /* create 'ktable' */
  lua_setuservalue(L, -2);
  buildgrammar(L, g, frule, n);
  lua_getuservalue(L, -1);  /* get 'ktable' for new tree */
  finalfix(L, frule - 1, g, sib1(g));
  /* the next two calls access the ktable at the top of the stack */
  initialrulename(L, g, frule);
  verifygrammar(L, g);
  lua_pop(L, 1);  /* remove 'ktable' */
  lua_insert(L, -(n * 2 + 2));  /* move new table to proper position */
  lua_pop(L, n * 2 + 1);  /* remove position table + rule pairs */
  return g;  /* new table at the top of the stack */
}

/* }====================================================== */


/*
** Prepare pattern 'p' (the value at stack index 'idx') for matching:
** run 'finalfix' over its tree, with its ktable temporarily pushed on
** the stack, and then compile it. Returns the result of 'compile'.
*/
static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) {
  lua_getuservalue(L, idx);  /* push 'ktable' (may be used by 'finalfix') */
  finalfix(L, 0, NULL, p->tree);
  lua_pop(L, 1);  /* remove 'ktable' */
  return compile(L, p, getsize(L, idx));
}


/*
** Lua function 'ptree': print the ktable and the tree of the pattern
** given as first argument. When the second argument is true, the tree
** goes through 'finalfix' before being printed. Returns no values.
*/
static int lp_printtree (lua_State *L) {
  TTree *tree = getpatt(L, 1, NULL);
  int c = lua_toboolean(L, 2);
  if (c) {
    lua_getuservalue(L, 1);  /* push 'ktable' (may be used by 'finalfix') */
    finalfix(L, 0, NULL, tree);
    lua_pop(L, 1);  /* remove 'ktable' */
  }
  printktable(L, 1);
  printtree(tree, 0);
  return 0;
}


static int lp_printcode (lua_State *L) { Pattern *p = getpattern(L, 1); printktable(L, 1); if (p->code == NULL) /* not compiled yet? 
*/ prepcompile(L, p, 1); printpatt(p->code); return 0; } /* ** Get the initial position for the match, interpreting negative ** values from the end of the subject */ static size_t initposition (lua_State *L, size_t len) { lua_Integer ii = luaL_optinteger(L, 3, 1); if (ii > 0) { /* positive index? */ if ((size_t)ii <= len) /* inside the string? */ return (size_t)ii - 1; /* return it (corrected to 0-base) */ else return len; /* crop at the end */ } else { /* negative index */ if ((size_t)(-ii) <= len) /* inside the string? */ return len - ((size_t)(-ii)); /* return position from the end */ else return 0; /* crop at the beginning */ } } /* ** Main match function */ static int lp_match (lua_State *L) { Capture capture[INITCAPSIZE]; const char *r; size_t l; Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1)); Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1); const char *s = luaL_checklstring(L, SUBJIDX, &l); size_t i = initposition(L, l); int ptop = lua_gettop(L); luaL_argcheck(L, l < MAXINDT, SUBJIDX, "subject too long"); lua_pushnil(L); /* initialize subscache */ lua_pushlightuserdata(L, capture); /* initialize caplistidx */ lua_getuservalue(L, 1); /* initialize ktableidx */ r = match(L, s, s + i, s + l, code, capture, ptop); if (r == NULL) { lua_pushnil(L); return 1; } return getcaptures(L, s, r, ptop); } /* ** {====================================================== ** Library creation and functions not related to matching ** ======================================================= */ /* maximum limit for stack size */ #define MAXLIM (INT_MAX / 100) static int lp_setmax (lua_State *L) { lua_Integer lim = luaL_checkinteger(L, 1); luaL_argcheck(L, 0 < lim && lim <= MAXLIM, 1, "out of range"); lua_settop(L, 1); lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); return 0; } static int lp_type (lua_State *L) { if (testpattern(L, 1)) lua_pushliteral(L, "pattern"); else lua_pushnil(L); return 1; } int lp_gc (lua_State *L) { Pattern *p = getpattern(L, 1); 
freecode(L, p); /* delete code block */ return 0; }


/*
** Create a charset representing a category of characters, given by
** the predicate 'catf'.
** Every value 'c' in [0, UCHAR_MAX] for which 'catf(c)' is true is
** added to the set. The set is handed to 'newcharset' (presumably
** pushing the resulting pattern -- confirm in its definition) and the
** value then at the top of the stack is stored in field 'catname' of
** the table just below it.
*/
static void createcat (lua_State *L, const char *catname, int (catf) (int)) {
  int c;
  byte buff[CHARSETSIZE];
  clearset(buff);
  for (c = 0; c <= UCHAR_MAX; c++)
    if (catf(c)) setchar(buff, c);
  newcharset(L, buff);
  lua_setfield(L, -2, catname);
}


/*
** Lua function 'locale': fill a table with one field for each of the
** <ctype.h> predicates listed below. When the first argument is absent
** or nil a new table is created; otherwise the argument must be a
** table, which is filled in place. Returns that table.
*/
static int lp_locale (lua_State *L) {
  if (lua_isnoneornil(L, 1)) {
    lua_settop(L, 0);
    lua_createtable(L, 0, 12);
  }
  else {
    luaL_checktype(L, 1, LUA_TTABLE);
    lua_settop(L, 1);  /* discard extra arguments; table is at the top */
  }
  createcat(L, "alnum", isalnum);
  createcat(L, "alpha", isalpha);
  createcat(L, "cntrl", iscntrl);
  createcat(L, "digit", isdigit);
  createcat(L, "graph", isgraph);
  createcat(L, "lower", islower);
  createcat(L, "print", isprint);
  createcat(L, "punct", ispunct);
  createcat(L, "space", isspace);
  createcat(L, "upper", isupper);
  createcat(L, "xdigit", isxdigit);
  return 1;
}


/*
** Functions registered in the library table by 'luaopen_lpeg'.
** The entry "version" has no function here; 'luaopen_lpeg' stores a
** string in that field after the table is created.
*/
static struct luaL_Reg pattreg[] = {
  {"ptree", lp_printtree},
  {"pcode", lp_printcode},
  {"match", lp_match},
  {"B", lp_behind},
  {"V", lp_V},
  {"C", lp_simplecapture},
  {"Cc", lp_constcapture},
  {"Cmt", lp_matchtime},
  {"Cb", lp_backref},
  {"Carg", lp_argcapture},
  {"Cp", lp_poscapture},
  {"Cs", lp_substcapture},
  {"Ct", lp_tablecapture},
  {"Cf", lp_foldcapture},
  {"Cg", lp_groupcapture},
  {"P", lp_P},
  {"S", lp_set},
  {"R", lp_range},
  {"utfR", lp_utfr},
  {"locale", lp_locale},
  {"version", NULL},
  {"setmaxstack", lp_setmax},
  {"type", lp_type},
  {NULL, NULL}
};


/*
** Metamethods registered in the pattern metatable (PATTERN_T) by
** 'luaopen_lpeg'.
*/
static struct luaL_Reg metareg[] = {
  {"__mul", lp_seq},  /* patt1 * patt2 */
  {"__add", lp_choice},  /* patt1 + patt2 */
  {"__pow", lp_star},  /* patt ^ n */
  {"__gc", lp_gc},
  {"__len", lp_and},  /* #patt */
  {"__div", lp_divcapture},  /* patt / value */
  {"__mod", lp_acccapture},  /* patt % function */
  {"__unm", lp_not},  /* -patt */
  {"__sub", lp_sub},  /* patt1 - patt2 */
  {NULL, NULL}
};


int luaopen_lpeg (lua_State *L); int luaopen_lpeg (lua_State *L) { luaL_newmetatable(L, PATTERN_T); lua_pushnumber(L, MAXBACK); /* initialize maximum backtracking */ lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); luaL_setfuncs(L, metareg, 0); luaL_newlib(L, pattreg); 
lua_pushvalue(L, -1); lua_setfield(L, -3, "__index"); lua_pushliteral(L, "LPeg " VERSION); lua_setfield(L, -2, "version"); return 1; } /* }====================================================== */ lpeg-1.1.0/lpcap.h0000664000175000017500000000466214446336477013656 0ustar robertoroberto #if !defined(lpcap_h) #define lpcap_h #include "lptypes.h" /* kinds of captures */ typedef enum CapKind { Cclose, /* not used in trees */ Cposition, Cconst, /* ktable[key] is Lua constant */ Cbackref, /* ktable[key] is "name" of group to get capture */ Carg, /* 'key' is arg's number */ Csimple, /* next node is pattern */ Ctable, /* next node is pattern */ Cfunction, /* ktable[key] is function; next node is pattern */ Cacc, /* ktable[key] is function; next node is pattern */ Cquery, /* ktable[key] is table; next node is pattern */ Cstring, /* ktable[key] is string; next node is pattern */ Cnum, /* numbered capture; 'key' is number of value to return */ Csubst, /* substitution capture; next node is pattern */ Cfold, /* ktable[key] is function; next node is pattern */ Cruntime, /* not used in trees (is uses another type for tree) */ Cgroup /* ktable[key] is group's "name" */ } CapKind; /* ** An unsigned integer large enough to index any subject entirely. ** It can be size_t, but that will double the size of the array ** of captures in a 64-bit machine. */ #if !defined(Index_t) typedef uint Index_t; #endif #define MAXINDT (~(Index_t)0) typedef struct Capture { Index_t index; /* subject position */ unsigned short idx; /* extra info (group name, arg index, etc.) 
*/ byte kind; /* kind of capture */ byte siz; /* size of full capture + 1 (0 = not a full capture) */ } Capture; typedef struct CapState { Capture *cap; /* current capture */ Capture *ocap; /* (original) capture list */ lua_State *L; int ptop; /* stack index of last argument to 'match' */ int firstcap; /* stack index of first capture pushed in the stack */ const char *s; /* original string */ int valuecached; /* value stored in cache slot */ int reclevel; /* recursion level */ } CapState; #define captype(cap) ((cap)->kind) #define isclosecap(cap) (captype(cap) == Cclose) #define isopencap(cap) ((cap)->siz == 0) /* true if c2 is (any number of levels) inside c1 */ #define capinside(c1,c2) \ (isopencap(c1) ? !isclosecap(c2) \ : (c2)->index < (c1)->index + (c1)->siz - 1) /** ** Maximum number of captures to visit when looking for an 'open'. */ #define MAXLOP 20 int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); int getcaptures (lua_State *L, const char *s, const char *r, int ptop); int finddyncap (Capture *cap, Capture *last); #endif lpeg-1.1.0/HISTORY0000664000175000017500000000665514446336477013476 0ustar robertorobertoHISTORY for LPeg 1.1.0 * Changes from version 1.0.2 to 1.1.0 --------------------------------- + accumulator capture + UTF-8 ranges + Larger limit for number of rules in a grammar + Larger limit for number of captures in a match + bug fixes + other small improvements * Changes from version 1.0.1 to 1.0.2 --------------------------------- + some bugs fixed * Changes from version 0.12 to 1.0.1 --------------------------------- + group "names" can be any Lua value + some bugs fixed + other small improvements * Changes from version 0.11 to 0.12 --------------------------------- + no "unsigned short" limit for pattern sizes + mathtime captures considered nullable + some bugs fixed * Changes from version 0.10 to 0.11 ------------------------------- + complete reimplementation of the code generator + new syntax for table captures + new 
functions in module 're' + other small improvements * Changes from version 0.9 to 0.10 ------------------------------- + backtrack stack has configurable size + better error messages + Notation for non-terminals in 're' back to A instead of <A> + experimental look-behind pattern + support for external extensions + works with Lua 5.2 + consumes less C stack - "and" predicates do not keep captures * Changes from version 0.8 to 0.9 ------------------------------- + The accumulator capture was replaced by a fold capture; programs that used the old 'lpeg.Ca' will need small changes. + Some support for character classes from old C locales. + A new named-group capture. * Changes from version 0.7 to 0.8 ------------------------------- + New "match-time" capture. + New "argument capture" that allows passing arguments into the pattern. + Better documentation for 're'. + Several small improvements for 're'. + The 're' module has an incompatibility with previous versions: now, any use of a non-terminal must be enclosed in angle brackets (like <B>). * Changes from version 0.6 to 0.7 ------------------------------- + Several improvements in module 're': - better documentation; - support for most captures (all but accumulator); - limited repetitions p{n,m}. + Small improvements in efficiency. + Several small bugs corrected (special thanks to Hans Hagen and Taco Hoekwater). * Changes from version 0.5 to 0.6 ------------------------------- + Support for non-numeric indices in grammars. + Some bug fixes (thanks to the luatex team). + Some new optimizations (thanks to Mike Pall). + A new page layout (thanks to Andre Carregal). + Minimal documentation for module 're'. * Changes from version 0.4 to 0.5 ------------------------------- + Several optimizations. + lpeg.P now accepts booleans. + Some new examples. + A proper license. + Several small improvements. * Changes from version 0.3 to 0.4 ------------------------------- + Static check for loops in repetitions and grammars. 
+ Removed label option in captures. + The implementation of captures uses less memory. * Changes from version 0.2 to 0.3 ------------------------------- + User-defined patterns in Lua. + Several new captures. * Changes from version 0.1 to 0.2 ------------------------------- + Several small corrections. + Handles embedded zeros like any other character. + Capture "name" can be any Lua value. + Unlimited number of captures. + Match gets an optional initial position. (end of HISTORY) lpeg-1.1.0/lpeg.html0000664000175000017500000012514114446336477014217 0ustar robertoroberto LPeg - Parsing Expression Grammars For Lua
LPeg
Parsing Expression Grammars For Lua, version 1.1

Introduction

LPeg is a new pattern-matching library for Lua, based on Parsing Expression Grammars (PEGs). This text is a reference manual for the library. For a more formal treatment of LPeg, as well as some discussion about its implementation, see A Text Pattern-Matching Tool based on Parsing Expression Grammars. (You may also be interested in my talk about LPeg given at the III Lua Workshop.)

Following the Snobol tradition, LPeg defines patterns as first-class objects. That is, patterns are regular Lua values (represented by userdata). The library offers several functions to create and compose patterns. With the use of metamethods, several of these functions are provided as infix or prefix operators. On the one hand, the result is usually much more verbose than the typical encoding of patterns using the so called regular expressions (which typically are not regular expressions in the formal sense). On the other hand, first-class patterns allow much better documentation (as it is easy to comment the code, to break complex definitions in smaller parts, etc.) and are extensible, as we can define new functions to create and compose patterns.

For a quick glance of the library, the following table summarizes its basic operations for creating patterns:

Operator — Description
lpeg.P(string) Matches string literally
lpeg.P(n) Matches exactly n characters
lpeg.S(string) Matches any character in string (Set)
lpeg.R("xy") Matches any character between x and y (Range)
lpeg.utfR(cp1, cp2) Matches a UTF-8 code point between cp1 and cp2
patt^n Matches at least n repetitions of patt
patt^-n Matches at most n repetitions of patt
patt1 * patt2 Matches patt1 followed by patt2
patt1 + patt2 Matches patt1 or patt2 (ordered choice)
patt1 - patt2 Matches patt1 if patt2 does not match
-patt Equivalent to ("" - patt)
#patt Matches patt but consumes no input
lpeg.B(patt) Matches patt behind the current position, consuming no input

As a very simple example, lpeg.R("09")^1 creates a pattern that matches a non-empty sequence of digits. As a not so simple example, -lpeg.P(1) (which can be written as lpeg.P(-1), or simply -1 for operations expecting a pattern) matches an empty string only if it cannot match a single character; so, it succeeds only at the end of the subject.

LPeg also offers the re module, which implements patterns following a regular-expression style (e.g., [0-9]+). (This module is 270 lines of Lua code, and of course it uses LPeg to parse regular expressions and translate them to regular LPeg patterns.)

Functions

lpeg.match (pattern, subject [, init])

The matching function. It attempts to match the given pattern against the subject string. If the match succeeds, returns the index in the subject of the first character after the match, or the captured values (if the pattern captured any value).

An optional numeric argument init makes the match start at that position in the subject string. As in the Lua standard libraries, a negative value counts from the end.

Unlike typical pattern-matching functions, match works only in anchored mode; that is, it tries to match the pattern with a prefix of the given subject string (at position init), not with an arbitrary substring of the subject. So, if we want to find a pattern anywhere in a string, we must either write a loop in Lua or write a pattern that matches anywhere. This second approach is easy and quite efficient; see examples.

lpeg.type (value)

If the given value is a pattern, returns the string "pattern". Otherwise returns nil.

lpeg.version

A string (not a function) with the running version of LPeg.

lpeg.setmaxstack (max)

Sets a limit for the size of the backtrack stack used by LPeg to track calls and choices. (The default limit is 400.) Most well-written patterns need few backtrack levels and therefore you seldom need to change this limit; before changing it you should try to rewrite your pattern to avoid the need for extra space. Nevertheless, a few useful patterns may overflow. Also, with recursive grammars, subjects with deep recursion may need larger limits.

Basic Constructions

The following operations build patterns. All operations that expect a pattern as an argument may receive also strings, tables, numbers, booleans, or functions, which are translated to patterns according to the rules of function lpeg.P.

lpeg.P (value)

Converts the given value into a proper pattern, according to the following rules:

  • If the argument is a pattern, it is returned unmodified.

  • If the argument is a string, it is translated to a pattern that matches the string literally.

  • If the argument is a non-negative number n, the result is a pattern that matches exactly n characters.

  • If the argument is a negative number -n, the result is a pattern that succeeds only if the input string has fewer than n characters left: lpeg.P(-n) is equivalent to -lpeg.P(n) (see the unary minus operation).

  • If the argument is a boolean, the result is a pattern that always succeeds or always fails (according to the boolean value), without consuming any input.

  • If the argument is a table, it is interpreted as a grammar (see Grammars).

  • If the argument is a function, returns a pattern equivalent to a match-time capture over the empty string.

lpeg.B(patt)

Returns a pattern that matches only if the input string at the current position is preceded by patt. Pattern patt must match only strings with some fixed length, and it cannot contain captures.

Like the and predicate, this pattern never consumes any input, independently of success or failure.

lpeg.R ({range})

Returns a pattern that matches any single character belonging to one of the given ranges. Each range is a string xy of length 2, representing all characters with code between the codes of x and y (both inclusive).

As an example, the pattern lpeg.R("09") matches any digit, and lpeg.R("az", "AZ") matches any ASCII letter.

lpeg.S (string)

Returns a pattern that matches any single character that appears in the given string. (The S stands for Set.)

As an example, the pattern lpeg.S("+-*/") matches any arithmetic operator.

Note that, if s is a character (that is, a string of length 1), then lpeg.P(s) is equivalent to lpeg.S(s) which is equivalent to lpeg.R(s..s). Note also that both lpeg.S("") and lpeg.R() are patterns that always fail.

lpeg.utfR (cp1, cp2)

Returns a pattern that matches a valid UTF-8 byte sequence representing a code point in the range [cp1, cp2]. The range is limited by the natural Unicode limit of 0x10FFFF, but may include surrogates.

lpeg.V (v)

This operation creates a non-terminal (a variable) for a grammar. The created non-terminal refers to the rule indexed by v in the enclosing grammar. (See Grammars for details.)

lpeg.locale ([table])

Returns a table with patterns for matching some character classes according to the current locale. The table has fields named alnum, alpha, cntrl, digit, graph, lower, print, punct, space, upper, and xdigit, each one containing a corresponding pattern. Each pattern matches any single character that belongs to its class.

If called with an argument table, then it creates those fields inside the given table and returns that table.

#patt

Returns a pattern that matches only if the input string matches patt, but without consuming any input, independently of success or failure. (This pattern is called an and predicate and it is equivalent to &patt in the original PEG notation.)

This pattern never produces any capture.

-patt

Returns a pattern that matches only if the input string does not match patt. It does not consume any input, independently of success or failure. (This pattern is equivalent to !patt in the original PEG notation.)

As an example, the pattern -lpeg.P(1) matches only the end of string.

This pattern never produces any captures, because either patt fails or -patt fails. (A failing pattern never produces captures.)

patt1 + patt2

Returns a pattern equivalent to an ordered choice of patt1 and patt2. (This is denoted by patt1 / patt2 in the original PEG notation, not to be confused with the / operation in LPeg.) It matches either patt1 or patt2, with no backtracking once one of them succeeds. The identity element for this operation is the pattern lpeg.P(false), which always fails.

If both patt1 and patt2 are character sets, this operation is equivalent to set union.

lower = lpeg.R("az")
upper = lpeg.R("AZ")
letter = lower + upper

patt1 - patt2

Returns a pattern equivalent to !patt2 patt1 in the original PEG notation. This pattern asserts that the input does not match patt2 and then matches patt1.

When successful, this pattern produces all captures from patt1. It never produces any capture from patt2 (as either patt2 fails or patt1 - patt2 fails).

If both patt1 and patt2 are character sets, this operation is equivalent to set difference. Note that -patt is equivalent to "" - patt (or 0 - patt). If patt is a character set, 1 - patt is its complement.

patt1 * patt2

Returns a pattern that matches patt1 and then matches patt2, starting where patt1 finished. The identity element for this operation is the pattern lpeg.P(true), which always succeeds.

(LPeg uses the * operator [instead of the more obvious ..] both because it has the right priority and because in formal languages it is common to use a dot for denoting concatenation.)

patt^n

If n is nonnegative, this pattern is equivalent to pattⁿ patt* (that is, n consecutive copies of patt followed by patt*): it matches n or more occurrences of patt.

Otherwise, when n is negative, this pattern is equivalent to (patt?)⁻ⁿ (that is, |n| consecutive copies of patt?): it matches at most |n| occurrences of patt.

In particular, patt^0 is equivalent to patt*, patt^1 is equivalent to patt+, and patt^-1 is equivalent to patt? in the original PEG notation.

In all cases, the resulting pattern is greedy with no backtracking (also called a possessive repetition). That is, it matches only the longest possible sequence of matches for patt.

Grammars

With the use of Lua variables, it is possible to define patterns incrementally, with each new pattern using previously defined ones. However, this technique does not allow the definition of recursive patterns. For recursive patterns, we need real grammars.

LPeg represents grammars with tables, where each entry is a rule.

The call lpeg.V(v) creates a pattern that represents the nonterminal (or variable) with index v in a grammar. Because the grammar still does not exist when this function is evaluated, the result is an open reference to the respective rule.

A table is fixed when it is converted to a pattern (either by calling lpeg.P or by using it wherein a pattern is expected). Then every open reference created by lpeg.V(v) is corrected to refer to the rule indexed by v in the table.

When a table is fixed, the result is a pattern that matches its initial rule. The entry with index 1 in the table defines its initial rule. If that entry is a string, it is assumed to be the name of the initial rule. Otherwise, LPeg assumes that the entry 1 itself is the initial rule.

As an example, the following grammar matches strings of a's and b's that have the same number of a's and b's:

equalcount = lpeg.P{
  "S";   -- initial rule name
  S = "a" * lpeg.V"B" + "b" * lpeg.V"A" + "",
  A = "a" * lpeg.V"S" + "b" * lpeg.V"A" * lpeg.V"A",
  B = "b" * lpeg.V"S" + "a" * lpeg.V"B" * lpeg.V"B",
} * -1

It is equivalent to the following grammar in standard PEG notation:

  S <- 'a' B / 'b' A / ''
  A <- 'a' S / 'b' A A
  B <- 'b' S / 'a' B B

Captures

A capture is a pattern that produces values (the so called semantic information) according to what it matches. LPeg offers several kinds of captures, which produce values based on matches and combine these values to produce new values. Each capture may produce zero or more values.

The following table summarizes the basic captures:

Operation — What it Produces
lpeg.C(patt) the match for patt plus all captures made by patt
lpeg.Carg(n) the value of the nth extra argument to lpeg.match (matches the empty string)
lpeg.Cb(key) the values produced by the previous group capture named key (matches the empty string)
lpeg.Cc(values) the given values (matches the empty string)
lpeg.Cf(patt, func) folding capture (deprecated)
lpeg.Cg(patt [, key]) the values produced by patt, optionally tagged with key
lpeg.Cp() the current position (matches the empty string)
lpeg.Cs(patt) the match for patt with the values from nested captures replacing their matches
lpeg.Ct(patt) a table with all captures from patt
patt / string string, with some marks replaced by captures of patt
patt / number the n-th value captured by patt, or no value when number is zero.
patt / table table[c], where c is the (first) capture of patt
patt / function the returns of function applied to the captures of patt
patt % function produces no value; it accumulates the captures from patt into the previous capture through function
lpeg.Cmt(patt, function) the returns of function applied to the captures of patt; the application is done at match time

A capture pattern produces its values only when it succeeds. For instance, the pattern lpeg.C(lpeg.P"a"^-1) produces the empty string when there is no "a" (because the pattern "a"? succeeds), while the pattern lpeg.C("a")^-1 does not produce any value when there is no "a" (because the pattern "a" fails). A pattern inside a loop or inside a recursive structure produces values for each match.

Usually, LPeg does not specify when (and if) it evaluates its captures. (As an example, consider the pattern lpeg.P"a" / func / 0. Because the "division" by 0 instructs LPeg to throw away the results from the pattern, it is not specified whether LPeg will call func.) Therefore, captures should avoid side effects. Moreover, captures cannot affect the way a pattern matches a subject. The only exception to this rule is the so-called match-time capture. When a match-time capture matches, it forces the immediate evaluation of all its nested captures and then calls its corresponding function, which defines whether the match succeeds and also what values are produced.

lpeg.C (patt)

Creates a simple capture, which captures the substring of the subject that matches patt. The captured value is a string. If patt has other captures, their values are returned after this one.

lpeg.Carg (n)

Creates an argument capture. This pattern matches the empty string and produces the value given as the nth extra argument given in the call to lpeg.match.

lpeg.Cb (key)

Creates a back capture. This pattern matches the empty string and produces the values produced by the most recent group capture named key (where key can be any Lua value).

Most recent means the last complete outermost group capture with the given key. A Complete capture means that the entire pattern corresponding to the capture has matched; in other words, the back capture is not nested inside the group. An Outermost capture means that the capture is not inside another complete capture that does not contain the back capture itself.

In the same way that LPeg does not specify when it evaluates captures, it does not specify whether it reuses values previously produced by the group or re-evaluates them.

lpeg.Cc ([value, ...])

Creates a constant capture. This pattern matches the empty string and produces all given values as its captured values.

lpeg.Cf (patt, func)

Creates a fold capture. This construction is deprecated; use an accumulator pattern instead. In general, a fold like lpeg.Cf(p1 * p2^0, func) can be translated to (p1 * (p2 % func)^0).

lpeg.Cg (patt [, key])

Creates a group capture. It groups all values returned by patt into a single capture. The group may be anonymous (if no key is given) or named with the given key (which can be any non-nil Lua value).

An anonymous group serves to join values from several captures into a single capture. A named group has a different behavior. In most situations, a named group returns no values at all. Its values are only relevant for a following back capture or when used inside a table capture.

lpeg.Cp ()

Creates a position capture. It matches the empty string and captures the position in the subject where the match occurs. The captured value is a number.

lpeg.Cs (patt)

Creates a substitution capture, which captures the substring of the subject that matches patt, with substitutions. For any capture inside patt with a value, the substring that matched the capture is replaced by the capture value (which should be a string). The final captured value is the string resulting from all replacements.

lpeg.Ct (patt)

Creates a table capture. This capture returns a table with all values from all anonymous captures made by patt inside this table in successive integer keys, starting at 1. Moreover, for each named capture group created by patt, the first value of the group is put into the table with the group key as its key. The captured value is only the table.

patt / string

Creates a string capture. It creates a capture string based on string. The captured value is a copy of string, except that the character % works as an escape character: any sequence in string of the form %n, with n between 1 and 9, stands for the match of the n-th capture in patt. The sequence %0 stands for the whole match. The sequence %% stands for a single %.

patt / number

Creates a numbered capture. For a non-zero number, the captured value is the n-th value captured by patt. When number is zero, there are no captured values.

patt / table

Creates a query capture. It indexes the given table using as key the first value captured by patt, or the whole match if patt produced no value. The value at that index is the final value of the capture. If the table does not have that key, there is no captured value.

patt / function

Creates a function capture. It calls the given function passing all captures made by patt as arguments, or the whole match if patt made no capture. The values returned by the function are the final values of the capture. In particular, if function returns no value, there is no captured value.

patt % function

Creates an accumulator capture. This pattern behaves similarly to a function capture, with the following differences: The last captured value before patt is added as a first argument to the call; the return of the function is adjusted to one single value; that value replaces the last captured value. Note that the capture itself produces no values; it only changes the value of its previous capture.

As an example, let us consider the problem of adding a list of numbers.

-- matches a numeral and captures its numerical value
number = lpeg.R"09"^1 / tonumber

-- auxiliary function to add two numbers
function add (acc, newvalue) return acc + newvalue end

-- matches a list of numbers, adding their values
sum = number * ("," * number % add)^0

-- example of use
print(sum:match("10,30,43"))   --> 83

First, the initial number captures a number; that first capture will play the role of an accumulator. Then, each time the sequence comma-number matches inside the loop there is an accumulator capture: It calls add with the current value of the accumulator—which is the last captured value, initially created by the first number—and the value of the new number, and the result of the call (the sum of the two numbers) replaces the value of the accumulator. At the end of the match, the accumulator with all sums is the final value.

As another example, consider the following code fragment:

local name = lpeg.C(lpeg.R("az")^1)
local p = name * (lpeg.P("^") % string.upper)^-1
print(p:match("count"))    --> count
print(p:match("count^"))   --> COUNT

In the match against "count", as there is no "^", the optional accumulator capture does not match; so, the match results in its sole capture, a name. In the match against "count^", the accumulator capture matches, so the function string.upper is called with the previous captured value (created by name) plus the string "^"; the function ignores its second argument and returns the first argument changed to upper case; that value then becomes the first and only capture value created by the match.

Due to the nature of this capture, you should avoid using it in places where it is not clear what is the "previous" capture, such as directly nested in a string capture or a numbered capture. (Note that these captures may not need to evaluate all their subcaptures to compute their results.) Moreover, due to implementation details, you should not use this capture directly nested in a substitution capture. You should also avoid a direct nesting of this capture inside a folding capture (deprecated), as the folding will try to fold each individual accumulator capture. A simple and effective way to avoid all these issues is to enclose the whole accumulation composition (including the capture that generates the initial value) into an anonymous group capture.

lpeg.Cmt(patt, function)

Creates a match-time capture. Unlike all other captures, this one is evaluated immediately when a match occurs (even if it is part of a larger pattern that fails later). It forces the immediate evaluation of all its nested captures and then calls function.

The given function gets as arguments the entire subject, the current position (after the match of patt), plus any capture values produced by patt.

The first value returned by function defines how the match happens. If the call returns a number, the match succeeds and the returned number becomes the new current position. (Assuming a subject s and current position i, the returned number must be in the range [i, len(s) + 1].) If the call returns true, the match succeeds without consuming any input. (So, returning true is equivalent to returning i.) If the call returns false, nil, or no value, the match fails.

Any extra values returned by the function become the values produced by the capture.

Some Examples

Using a Pattern

This example shows a very simple but complete program that builds and uses a pattern:

local lpeg = require "lpeg"

-- matches a word followed by end-of-string
p = lpeg.R"az"^1 * -1

print(p:match("hello"))        --> 6
print(lpeg.match(p, "hello"))  --> 6
print(p:match("1 hello"))      --> nil

The pattern is simply a sequence of one or more lower-case letters followed by the end of string (-1). The program calls match both as a method and as a function. In both successful cases, the match returns the index of the first character after the match, which is the string length plus one.

Name-value lists

This example parses a list of name-value pairs and returns a table with those pairs:

lpeg.locale(lpeg)   -- adds locale entries into 'lpeg' table

local space = lpeg.space^0
local name = lpeg.C(lpeg.alpha^1) * space
local sep = lpeg.S(",;") * space
local pair = name * "=" * space * name * sep^-1
local list = lpeg.Ct("") * (pair % rawset)^0
t = list:match("a=b, c = hi; next = pi")
        --> { a = "b", c = "hi", next = "pi" }

Each pair has the format name = name followed by an optional separator (a comma or a semicolon). The list pattern then folds these captures. It starts with an empty table, created by a table capture matching an empty string; then, for each pair of names, it applies rawset over the accumulator (the table) and the capture values (the pair of names). rawset returns the table itself, so the accumulator is always the table.

Splitting a string

The following code builds a pattern that splits a string using a given pattern sep as a separator:

function split (s, sep)
  sep = lpeg.P(sep)
  local elem = lpeg.C((1 - sep)^0)
  local p = elem * (sep * elem)^0
  return lpeg.match(p, s)
end

First the function ensures that sep is a proper pattern. The pattern elem is a repetition of zero or more arbitrary characters as long as there is not a match against the separator. It also captures its match. The pattern p matches a list of elements separated by sep.

If the split results in too many values, it may overflow the maximum number of values that can be returned by a Lua function. To avoid this problem, we can collect these values in a table:

function split (s, sep)
  sep = lpeg.P(sep)
  local elem = lpeg.C((1 - sep)^0)
  local p = lpeg.Ct(elem * (sep * elem)^0)   -- make a table capture
  return lpeg.match(p, s)
end

Searching for a pattern

The primitive match works only in anchored mode. If we want to find a pattern anywhere in a string, we must write a pattern that matches anywhere.

Because patterns are composable, we can write a function that, given any arbitrary pattern p, returns a new pattern that searches for p anywhere in a string. There are several ways to do the search. One way is like this:

function anywhere (p)
  return lpeg.P{ p + 1 * lpeg.V(1) }
end

This grammar has a straight reading: its sole rule matches p or skips one character and tries again.

If we want to know where the pattern is in the string (instead of knowing only that it is there somewhere), we can add position captures to the pattern:

local Cp = lpeg.Cp()
function anywhere (p)
  return lpeg.P{ Cp * p * Cp + 1 * lpeg.V(1) }
end

print(anywhere("world"):match("hello world!"))   --> 7   12

Another option for the search is like this:

local Cp = lpeg.Cp()
function anywhere (p)
  return (1 - lpeg.P(p))^0 * Cp * p * Cp
end

Again the pattern has a straight reading: it skips as many characters as possible while not matching p, and then matches p plus appropriate captures.

If we want to look for a pattern only at word boundaries, we can use the following transformer:

local t = lpeg.locale()

function atwordboundary (p)
  return lpeg.P{
    [1] = p + t.alpha^0 * (1 - t.alpha)^1 * lpeg.V(1)
  }
end

Balanced parentheses

The following pattern matches only strings with balanced parentheses:

b = lpeg.P{ "(" * ((1 - lpeg.S"()") + lpeg.V(1))^0 * ")" }

Reading the first (and only) rule of the given grammar, we have that a balanced string is an open parenthesis, followed by zero or more repetitions of either a non-parenthesis character or a balanced string (lpeg.V(1)), followed by a closing parenthesis.

Global substitution

The next example does a job somewhat similar to string.gsub. It receives a pattern and a replacement value, and substitutes the replacement value for all occurrences of the pattern in a given string:

function gsub (s, patt, repl)
  patt = lpeg.P(patt)
  patt = lpeg.Cs((patt / repl + 1)^0)
  return lpeg.match(patt, s)
end

As in string.gsub, the replacement value can be a string, a function, or a table.

Comma-Separated Values (CSV)

This example breaks a string into comma-separated values, returning all fields:

local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' +
                    lpeg.C((1 - lpeg.S',\n"')^0)

local record = field * (',' * field)^0 * (lpeg.P'\n' + -1)

function csv (s)
  return lpeg.match(record, s)
end

A field is either a quoted field (which may contain any character except an individual quote, which may be written as two quotes that are replaced by one) or an unquoted field (which cannot contain commas, newlines, or quotes). A record is a list of fields separated by commas, ending with a newline or the string end (-1).

As it is, the previous pattern returns each field as a separate result. If we add a table capture in the definition of record, the pattern will return instead a single table containing all fields:

local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1)

Lua's long strings

A long string in Lua starts with the pattern [=*[ and ends at the first occurrence of ]=*] with exactly the same number of equal signs. If the opening brackets are followed by a newline, this newline is discarded (that is, it is not part of the string).

To match a long string in Lua, the pattern must capture the first repetition of equal signs and then, whenever it finds a candidate for closing the string, check whether it has the same number of equal signs.

equals = lpeg.P"="^0
open = "[" * lpeg.Cg(equals, "init") * "[" * lpeg.P"\n"^-1
close = "]" * lpeg.C(equals) * "]"
closeeq = lpeg.Cmt(close * lpeg.Cb("init"), function (s, i, a, b) return a == b end)
string = open * lpeg.C((lpeg.P(1) - closeeq)^0) * close / 1

The open pattern matches [=*[, capturing the repetitions of equal signs in a group named init; it also discards an optional newline, if present. The close pattern matches ]=*], also capturing the repetitions of equal signs. The closeeq pattern first matches close; then it uses a back capture to recover the capture made by the previous open, which is named init; finally it uses a match-time capture to check whether both captures are equal. The string pattern starts with an open, then it goes as far as possible until matching closeeq, and then matches the final close. The final numbered capture simply discards the capture made by close.

Arithmetic expressions

This example is a complete parser and evaluator for simple arithmetic expressions. We write it in two styles. The first approach first builds a syntax tree and then traverses this tree to compute the expression value:

-- Lexical Elements
local Space = lpeg.S(" \n\t")^0
local Number = lpeg.C(lpeg.P"-"^-1 * lpeg.R("09")^1) * Space
local TermOp = lpeg.C(lpeg.S("+-")) * Space
local FactorOp = lpeg.C(lpeg.S("*/")) * Space
local Open = "(" * Space
local Close = ")" * Space

-- Grammar
local Exp, Term, Factor = lpeg.V"Exp", lpeg.V"Term", lpeg.V"Factor"
G = lpeg.P{ Exp,
  Exp = lpeg.Ct(Term * (TermOp * Term)^0);
  Term = lpeg.Ct(Factor * (FactorOp * Factor)^0);
  Factor = Number + Open * Exp * Close;
}

G = Space * G * -1

-- Evaluator
function eval (x)
  if type(x) == "string" then
    return tonumber(x)
  else
    local op1 = eval(x[1])
    for i = 2, #x, 2 do
      local op = x[i]
      local op2 = eval(x[i + 1])
      if (op == "+") then op1 = op1 + op2
      elseif (op == "-") then op1 = op1 - op2
      elseif (op == "*") then op1 = op1 * op2
      elseif (op == "/") then op1 = op1 / op2
      end
    end
    return op1
  end
end

-- Parser/Evaluator
function evalExp (s)
  local t = lpeg.match(G, s)
  if not t then error("syntax error", 2) end
  return eval(t)
end

-- small example
print(evalExp"3 + 5*9 / (1+1) - 12")   --> 13.5

The second style computes the expression value on the fly, without building the syntax tree. The following grammar takes this approach. (It assumes the same lexical elements as before.)

-- Auxiliary function
function eval (v1, op, v2)
  if (op == "+") then return v1 + v2
  elseif (op == "-") then return v1 - v2
  elseif (op == "*") then return v1 * v2
  elseif (op == "/") then return v1 / v2
  end
end

-- Grammar
local V = lpeg.V
G = lpeg.P{ "Exp",
  Exp = V"Term" * (TermOp * V"Term" % eval)^0;
  Term = V"Factor" * (FactorOp * V"Factor" % eval)^0;
  Factor = Number / tonumber + Open * V"Exp" * Close;
}

-- small example
print(lpeg.match(G, "3 + 5*9 / (1+1) - 12"))   --> 13.5

Note the use of the accumulator capture. To compute the value of an expression, the accumulator starts with the value of the first term, and then applies eval over the accumulator, the operator, and the new term for each repetition.

Download

LPeg source code.

Probably, the easiest way to install LPeg is with LuaRocks. If you have LuaRocks installed, the following command is all you need to install LPeg:

$ luarocks install lpeg

License

Copyright © 2007-2023 Lua.org, PUC-Rio.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

lpeg-1.1.0/lpeg-128.gif0000664000175000017500000001147314446336477014332 0ustar robertorobertoGIF89a    %(!&!&#'$('*2*,.4+./0022-7446488;:=;?=A>;B!+ԬWH tk r嫵6c$*HG6[cF11WJua bu ɉEsa&}$DR:SH KuB>'"b:uY0R 6 :o ْvckXk=O%6aLe"Ё>I6TW>x{*?؃'ECy_R]q;e=6?#]ZpOia:vC !^ - CtAH+Tzܲ>SrL8\bƥX2rgtUBNDD amCА!^%v3/;zzaz: GpJ;sE㵶e|ڛL>JѮŮU?; ^[o `4[`/'&Hc jrI^F뙰7A#3˥fjwApū2} I^ >$A-_ŀ9 *w]NAIAXC«E-OL6ᜳyN?DΤ)<4H+.~AtcഇE49k8 @Ҿ[^C59( ,XHPx#ߎV@JVn SG.@0H)aUKB!oUcSdH4@˩NM>QUGJ4D&ٽAe?]PG=|qaBN !oP!e:"?vsv$R0eKx$r5縷U "~ib| X(prsdr}<ٝBAT#ሀ%2:DQvpR8aBΤdDhH'!l$㵸 @f*0ɐvH2 ɇ%8yh tM&}Q1X)19RlxjfAZt@:=ikN2|tb 'ϰ@T1 &:@NvT *2|t< BCMx䴜R1vhH=( 0Ebaveg>IsPQ"ĕOGH~H+(3}Dv@b'bS]Vh釬YX G, e AVc!3-%ձiPO='Bq CF$b$ X%b5 f 6a e.-lu]Į\71";Tk0|P &\Tۀr%8xbWG&}%Ov7Q}g*^ @~ʏqH[&XKY"8?KР~" ,@A?`I0 (^o kR @(1X8f+b|_+ 1{hI!XxN/s 1_rx2b#Dp;|i'9f;(fqE8صr0;TBn P{B1vx&nz2uţ]یn?ϨZ РD.j`XøRӒC>NJ Z`jHY虽I;A+cXن;an هu_ɴ9x4vxNMJa Yc48I|dcjř!5ch4>6r ME"&B<@zL|z l ` bR$3KrѺ=d8EC8vvGpSoF%^̽1S=1(ȘxR4Xef +LnX`S|G>6~{9?*Fӳ88ogea|4a| '0*շ 0bAPEc@b%[pMj'V6SepHaYBz4 D|s;cw&5|0za &D$@_0P}<M ų7@.z&/58@a)4 evv0 fw K4xv"p>0 ްl@]_ʡ_w%w-,Huz<4edI ~`0ġeQl=Dw4U6Y8D5p1`ȠD@ _؀Svy9k$X6#TjUA)8Pt FJ;&s|v7]<0` XҴOE1|q MRX#@\4 Ix &`@ e= POAXבcCXuA 5DQy P@ pxIxܐ0qb^[{Bou$I{1 b,x)cwezD E頕pQ0 q0tp PJ  SAQ B T_#$n@Ax4HDP+59p 9(7 ٟ%7 4rX,J0gC E-ғ5%Su|0Gp ;/* b 9uàI s39GuQY.H 2LKROOv2= ;m$jRjŖX)\` Kjl2}WFI68h{UuǤcu 2Q \З{cW5T*$4` oڨKo7vW} rZé#CУ K .a&prP$mu:*|ؐo' ÔKZU<ڢߖ@&X^4:`aI>t85( [gS`}3 k7q =,bفo_XU{ƩPh`0 4qA"KKD1:` W5UZaAq T2*Wt(JS+ q t ]  *2,qK @ LDjZ4Cxp3sf M90 I0b!{h;Ȋ T` ,4QW6PER!PH i`=" 0 0JkQazpvFeu*Q VA!{! M mfLj DbKv$=ۿv `PP-O [AΙME R(<;%l% BeQ@  Vน){Ry3  1F P@ =[J w!*8|ճMw. 
0nzv{Ȃ<Ȅ\Ȇ|ȈȊȌ;lpeg-1.1.0/lpprint.c0000664000175000017500000001577114446336477014245 0ustar robertoroberto #include #include #include #include "lptypes.h" #include "lpprint.h" #include "lpcode.h" #if defined(LPEG_DEBUG) /* ** {====================================================== ** Printing patterns (for debugging) ** ======================================================= */ void printcharset (const byte *st) { int i; printf("["); for (i = 0; i <= UCHAR_MAX; i++) { int first = i; while (i <= UCHAR_MAX && testchar(st, i)) i++; if (i - 1 == first) /* unary range? */ printf("(%02x)", first); else if (i - 1 > first) /* non-empty range? */ printf("(%02x-%02x)", first, i - 1); } printf("]"); } static void printIcharset (const Instruction *inst, const byte *buff) { byte cs[CHARSETSIZE]; int i; printf("(%02x-%d) ", inst->i.aux2.set.offset, inst->i.aux2.set.size); clearset(cs); for (i = 0; i < CHARSETSIZE * 8; i++) { if (charinset(inst, buff, i)) setchar(cs, i); } printcharset(cs); } static void printTcharset (TTree *tree) { byte cs[CHARSETSIZE]; int i; printf("(%02x-%d) ", tree->u.set.offset, tree->u.set.size); fillset(cs, tree->u.set.deflt); for (i = 0; i < tree->u.set.size; i++) cs[tree->u.set.offset + i] = treebuffer(tree)[i]; printcharset(cs); } static const char *capkind (int kind) { const char *const modes[] = { "close", "position", "constant", "backref", "argument", "simple", "table", "function", "accumulator", "query", "string", "num", "substitution", "fold", "runtime", "group"}; return modes[kind]; } static void printjmp (const Instruction *op, const Instruction *p) { printf("-> %d", (int)(p + (p + 1)->offset - op)); } void printinst (const Instruction *op, const Instruction *p) { const char *const names[] = { "any", "char", "set", "testany", "testchar", "testset", "span", "utf-range", "behind", "ret", "end", "choice", "jmp", "call", "open_call", "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", "fullcapture", "opencapture", 
"closecapture", "closeruntime", "--" }; printf("%02ld: %s ", (long)(p - op), names[p->i.code]); switch ((Opcode)p->i.code) { case IChar: { printf("'%c' (%02x)", p->i.aux1, p->i.aux1); break; } case ITestChar: { printf("'%c' (%02x)", p->i.aux1, p->i.aux1); printjmp(op, p); break; } case IUTFR: { printf("%d - %d", p[1].offset, utf_to(p)); break; } case IFullCapture: { printf("%s (size = %d) (idx = %d)", capkind(getkind(p)), getoff(p), p->i.aux2.key); break; } case IOpenCapture: { printf("%s (idx = %d)", capkind(getkind(p)), p->i.aux2.key); break; } case ISet: { printIcharset(p, (p+1)->buff); break; } case ITestSet: { printIcharset(p, (p+2)->buff); printjmp(op, p); break; } case ISpan: { printIcharset(p, (p+1)->buff); break; } case IOpenCall: { printf("-> %d", (p + 1)->offset); break; } case IBehind: { printf("%d", p->i.aux1); break; } case IJmp: case ICall: case ICommit: case IChoice: case IPartialCommit: case IBackCommit: case ITestAny: { printjmp(op, p); break; } default: break; } printf("\n"); } void printpatt (Instruction *p) { Instruction *op = p; uint n = op[-1].codesize - 1; while (p < op + n) { printinst(op, p); p += sizei(p); } } static void printcap (Capture *cap, int ident) { while (ident--) printf(" "); printf("%s (idx: %d - size: %d) -> %lu (%p)\n", capkind(cap->kind), cap->idx, cap->siz, (long)cap->index, (void*)cap); } /* ** Print a capture and its nested captures */ static Capture *printcap2close (Capture *cap, int ident) { Capture *head = cap++; printcap(head, ident); /* print head capture */ while (capinside(head, cap)) cap = printcap2close(cap, ident + 2); /* print nested captures */ if (isopencap(head)) { assert(isclosecap(cap)); printcap(cap++, ident); /* print and skip close capture */ } return cap; } void printcaplist (Capture *cap) { { /* for debugging, print first a raw list of captures */ Capture *c = cap; while (c->index != MAXINDT) { printcap(c, 0); c++; } } printf(">======\n"); while (!isclosecap(cap)) cap = printcap2close(cap, 0); 
printf("=======\n"); } /* }====================================================== */ /* ** {====================================================== ** Printing trees (for debugging) ** ======================================================= */ static const char *tagnames[] = { "char", "set", "any", "true", "false", "utf8.range", "rep", "seq", "choice", "not", "and", "call", "opencall", "rule", "xinfo", "grammar", "behind", "capture", "run-time" }; void printtree (TTree *tree, int ident) { int i; int sibs = numsiblings[tree->tag]; for (i = 0; i < ident; i++) printf(" "); printf("%s", tagnames[tree->tag]); switch (tree->tag) { case TChar: { int c = tree->u.n; if (isprint(c)) printf(" '%c'\n", c); else printf(" (%02X)\n", c); break; } case TSet: { printTcharset(tree); printf("\n"); break; } case TUTFR: { assert(sib1(tree)->tag == TXInfo); printf(" %d (%02x %d) - %d (%02x %d) \n", tree->u.n, tree->key, tree->cap, sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap); break; } case TOpenCall: case TCall: { assert(sib1(sib2(tree))->tag == TXInfo); printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); break; } case TBehind: { printf(" %d\n", tree->u.n); break; } case TCapture: { printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); break; } case TRule: { printf(" key: %d\n", tree->key); sibs = 1; /* do not print 'sib2' (next rule) as a sibling */ break; } case TXInfo: { printf(" n: %d\n", tree->u.n); break; } case TGrammar: { TTree *rule = sib1(tree); printf(" %d\n", tree->u.n); /* number of rules */ for (i = 0; i < tree->u.n; i++) { printtree(rule, ident + 2); rule = sib2(rule); } assert(rule->tag == TTrue); /* sentinel */ sibs = 0; /* siblings already handled */ break; } default: printf("\n"); break; } if (sibs >= 1) { printtree(sib1(tree), ident + 2); if (sibs >= 2) printtree(sib2(tree), ident + 2); } } void printktable (lua_State *L, int idx) { int n, i; lua_getuservalue(L, idx); if (lua_isnil(L, -1)) /* no ktable? 
*/ return; n = lua_rawlen(L, -1); printf("["); for (i = 1; i <= n; i++) { printf("%d = ", i); lua_rawgeti(L, -1, i); if (lua_isstring(L, -1)) printf("%s ", lua_tostring(L, -1)); else printf("%s ", lua_typename(L, lua_type(L, -1))); lua_pop(L, 1); } printf("]\n"); /* leave ktable at the stack */ } /* }====================================================== */ #endif lpeg-1.1.0/lpcap.c0000664000175000017500000004341214446336477013645 0ustar robertoroberto #include "lua.h" #include "lauxlib.h" #include "lpcap.h" #include "lpprint.h" #include "lptypes.h" #define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v) #define pushluaval(cs) getfromktable(cs, (cs)->cap->idx) #define skipclose(cs,head) \ if (isopencap(head)) { assert(isclosecap(cs->cap)); cs->cap++; } /* ** Return the size of capture 'cap'. If it is an open capture, 'close' ** must be its corresponding close. */ static Index_t capsize (Capture *cap, Capture *close) { if (isopencap(cap)) { assert(isclosecap(close)); return close->index - cap->index; } else return cap->siz - 1; } static Index_t closesize (CapState *cs, Capture *head) { return capsize(head, cs->cap); } /* ** Put at the cache for Lua values the value indexed by 'v' in ktable ** of the running pattern (if it is not there yet); returns its index. */ static int updatecache (CapState *cs, int v) { int idx = cs->ptop + 1; /* stack index of cache for Lua values */ if (v != cs->valuecached) { /* not there? */ getfromktable(cs, v); /* get value from 'ktable' */ lua_replace(cs->L, idx); /* put it at reserved stack position */ cs->valuecached = v; /* keep track of what is there */ } return idx; } static int pushcapture (CapState *cs); /* ** Goes back in a list of captures looking for an open capture ** corresponding to a close one. 
*/
static Capture *findopen (Capture *cap) {
  int n = 0;  /* number of closes waiting an open */
  /* The loop has no lower-bound check: it ends only by returning, so it
     assumes a matching open capture exists somewhere before 'cap'. */
  for (;;) {
    cap--;
    if (isclosecap(cap)) n++;  /* one more open to skip */
    else if (isopencap(cap))
      if (n-- == 0) return cap;  /* no pending closes: this is the match */
  }
}


/*
** Go to the next capture at the same level.
** Updates 'cs->cap' in place: for an open capture it moves past the
** matching close; otherwise it moves past every capture for which
** 'capinside' reports that it lies inside the current one.
*/
static void nextcap (CapState *cs) {
  Capture *cap = cs->cap;
  if (isopencap(cap)) {  /* must look for a close? */
    int n = 0;  /* number of opens waiting a close */
    for (;;) {  /* look for corresponding close */
      cap++;
      if (isopencap(cap)) n++;
      else if (isclosecap(cap))
        if (n-- == 0) break;
    }
    cs->cap = cap + 1;  /* + 1 to skip last close */
  }
  else {
    Capture *next;
    for (next = cap + 1; capinside(cap, next); next++)
      ;  /* skip captures inside current one */
    cs->cap = next;
  }
}


/*
** Push on the Lua stack all values generated by nested captures inside
** the current capture. Returns number of values pushed. 'addextra'
** makes it push the entire match after all captured values. The
** entire match is pushed also if there are no other nested values,
** so the function never returns zero.
** On return 'cs->cap' points past the current capture ('skipclose'
** also steps over the close entry when the head is an open capture).
*/
static int pushnestedvalues (CapState *cs, int addextra) {
  Capture *head = cs->cap++;  /* original capture */
  int n = 0;  /* number of pushed subvalues */
  /* repeat for all nested patterns */
  while (capinside(head, cs->cap))
    n += pushcapture(cs);
  if (addextra || n == 0) {  /* need extra? */
    /* whole match: text starting at head->index with the capture's size */
    lua_pushlstring(cs->L, cs->s + head->index, closesize(cs, head));
    n++;
  }
  skipclose(cs, head);
  return n;
}


/*
** Push only the first value generated by nested captures
** (any further values are popped, so exactly one value is left).
*/
static void pushonenestedvalue (CapState *cs) {
  int n = pushnestedvalues(cs, 0);
  if (n > 1)
    lua_pop(cs->L, n - 1);  /* pop extra values */
}


/*
** Checks whether group 'grp' is visible to 'ref', that is, 'grp' is
** not nested inside a full capture that does not contain 'ref'. (We
** only need to care for full captures because the search at 'findback'
** skips open-end blocks; so, if 'grp' is nested in a non-full capture,
** 'ref' is also inside it.)
To check this, we search backward for the
** inner full capture enclosing 'grp'. A full capture cannot contain
** non-full captures, so a close capture means we cannot be inside a
** full capture anymore.
** Returns nonzero when 'grp' is visible to 'ref', zero otherwise.
*/
static int capvisible (CapState *cs, Capture *grp, Capture *ref) {
  Capture *cap = grp;
  int i = MAXLOP;  /* maximum distance for an 'open' */
  /* walk backward from 'grp', bounded both by MAXLOP steps and by the
     start of the capture list ('cs->ocap') */
  while (i-- > 0 && cap-- > cs->ocap) {
    if (isclosecap(cap))
      return 1;  /* can stop the search */
    /* NOTE(review): presumably a full capture cannot span UCHAR_MAX or
       more subject positions, so captures this far back cannot enclose
       'grp' -- confirm against the definition of the capture size field */
    else if (grp->index - cap->index >= UCHAR_MAX)
      return 1;  /* can stop the search */
    else if (capinside(cap, grp))  /* is 'grp' inside cap? */
      return capinside(cap, ref);  /* ok iff cap also contains 'ref' */
  }
  return 1;  /* 'grp' is not inside any capture */
}


/*
** Try to find a named group capture with the name given at the top of
** the stack; goes backward from 'ref'.
** On success the reference name is popped and the group capture is
** returned. On failure 'luaL_error' is raised with the reference name
** still on the stack (it is used in the message).
*/
static Capture *findback (CapState *cs, Capture *ref) {
  lua_State *L = cs->L;
  Capture *cap = ref;
  while (cap-- > cs->ocap) {  /* repeat until end of list */
    if (isclosecap(cap))
      cap = findopen(cap);  /* skip nested captures */
    else if (capinside(cap, ref))
      continue;  /* enclosing captures are not visible to 'ref' */
    if (captype(cap) == Cgroup && capvisible(cs, cap, ref)) {
      getfromktable(cs, cap->idx);  /* get group name */
      if (lp_equal(L, -2, -1)) {  /* right group? */
        lua_pop(L, 2);  /* remove reference name and group name */
        return cap;
      }
      else lua_pop(L, 1);  /* remove group name */
    }
  }
  luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1));
  return NULL;  /* to avoid warnings */
}


/*
** Back-reference capture. Return number of values pushed.
** Temporarily repositions 'cs->cap' on the referenced group to push its
** values, then resumes right after the back-reference capture itself.
*/
static int backrefcap (CapState *cs) {
  int n;
  Capture *curr = cs->cap;  /* remember the back-reference capture */
  pushluaval(cs);  /* reference name */
  cs->cap = findback(cs, curr);  /* find corresponding group */
  n = pushnestedvalues(cs, 0);  /* push group's values */
  cs->cap = curr + 1;  /* continue after the back-reference */
  return n;
}


/*
** Table capture: creates a new table and populates it with nested
** captures.
*/
static int tablecap (CapState *cs) {
  lua_State *L = cs->L;
  Capture *head = cs->cap++;
  int n = 0;  /* number of positional values already stored */
  lua_newtable(L);
  while (capinside(head, cs->cap)) {
    if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) {  /* named group? */
      pushluaval(cs);  /* push group name */
      pushonenestedvalue(cs);
      lua_settable(L, -3);  /* table[name] = first value of the group */
    }
    else {  /* not a named group */
      int i;
      int k = pushcapture(cs);
      /* the k new values sit above the table; each lua_rawseti pops the
         top one, so values are stored from last (n + k) to first (n + 1) */
      for (i = k; i > 0; i--)  /* store all values into table */
        lua_rawseti(L, -(i + 1), n + i);
      n += k;
    }
  }
  skipclose(cs, head);
  return 1;  /* number of values pushed (only the table) */
}


/*
** Table-query capture
** Uses the first nested value as a key into the table stored in the
** ktable at the capture's 'idx'. Returns 1 with the value pushed, or 0
** (nothing left on the stack) when the lookup yields nil.
*/
static int querycap (CapState *cs) {
  int idx = cs->cap->idx;  /* read before the capture is consumed below */
  pushonenestedvalue(cs);  /* get nested capture */
  lua_gettable(cs->L, updatecache(cs, idx));  /* query cap. value at table */
  if (!lua_isnil(cs->L, -1))
    return 1;
  else {  /* no value */
    lua_pop(cs->L, 1);  /* remove nil */
    return 0;
  }
}


/*
** Fold capture
** The first value of the first nested capture is the initial
** accumulator; for each following nested capture the folding function
** is called as f(accumulator, values...) and its single result becomes
** the new accumulator. Returns 1 (the accumulator).
*/
static int foldcap (CapState *cs) {
  int n;
  lua_State *L = cs->L;
  Capture *head = cs->cap++;
  int idx = head->idx;  /* ktable index of the folding function */
  if (isclosecap(cs->cap) ||  /* no nested captures (large subject)? */
      (n = pushcapture(cs)) == 0)  /* nested captures with no values? */
    return luaL_error(L, "no initial value for fold capture");
  if (n > 1)
    lua_pop(L, n - 1);  /* leave only one result for accumulator */
  while (capinside(head, cs->cap)) {
    lua_pushvalue(L, updatecache(cs, idx));  /* get folding function */
    lua_insert(L, -2);  /* put it before accumulator */
    n = pushcapture(cs);  /* get next capture's values */
    lua_call(L, n + 1, 1);  /* call folding function */
  }
  skipclose(cs, head);
  return 1;  /* only accumulator left on the stack */
}


/*
** Function capture
** Calls the function stored in the ktable with all nested values (or
** the whole match when there are none) and returns how many results
** the call left on the stack.
*/
static int functioncap (CapState *cs) {
  int n;
  int top = lua_gettop(cs->L);  /* stack level before the call setup */
  pushluaval(cs);  /* push function */
  n = pushnestedvalues(cs, 0);  /* push nested captures */
  lua_call(cs->L, n, LUA_MULTRET);  /* call function */
  return lua_gettop(cs->L) - top;  /* return function's results */
}


/*
** Accumulator capture
** Calls the function as f(previous value, nested values...) and leaves
** its single result in the stack slot of the previous value; hence it
** reports 0 new values.
*/
static int accumulatorcap (CapState *cs) {
  lua_State *L = cs->L;
  int n;
  /* NOTE(review): 'firstcap' is presumably the stack index of the first
     capture value of this match -- confirm in the CapState declaration */
  if (lua_gettop(L) < cs->firstcap)
    luaL_error(L, "no previous value for accumulator capture");
  pushluaval(cs);  /* push function */
  lua_insert(L, -2);  /* previous value becomes first argument */
  n = pushnestedvalues(cs, 0);  /* push nested captures */
  lua_call(L, n + 1, 1);  /* call function */
  return 0;  /* did not add any extra value */
}


/*
** Select capture
** Keeps only the 'idx'-th nested value; an 'idx' of zero produces no
** value at all and skips the nested captures without evaluating them.
*/
static int numcap (CapState *cs) {
  int idx = cs->cap->idx;  /* value to select */
  if (idx == 0) {  /* no values? */
    nextcap(cs);  /* skip entire capture */
    return 0;  /* no value produced */
  }
  else {
    int n = pushnestedvalues(cs, 0);
    if (n < idx)  /* invalid index? */
      return luaL_error(cs->L, "no capture '%d'", idx);
    else {
      lua_pushvalue(cs->L, -(n - idx + 1));  /* get selected capture */
      lua_replace(cs->L, -(n + 1));  /* put it in place of 1st capture */
      lua_pop(cs->L, n - 1);  /* remove other captures */
      return 1;
    }
  }
}


/*
** Return the stack index of the first runtime capture in the given
** list of captures (or zero if no runtime captures)
** The list scanned is the half-open range [cap, last).
*/
int finddyncap (Capture *cap, Capture *last) {
  for (; cap < last; cap++) {
    if (cap->kind == Cruntime)
      return cap->idx;  /* stack position of first capture */
  }
  return 0;  /* no dynamic captures in this segment */
}


/*
** Calls a runtime capture. Returns number of captures "removed" by the
** call, that is, those inside the group capture. Captures to be added
** are on the Lua stack.
** 'close' is the (not yet closed) end of the group, 's' the current
** subject position; '*rem' receives the number of stack values of old
** dynamic captures that were removed.
*/
int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) {
  int n, id;
  lua_State *L = cs->L;
  int otop = lua_gettop(L);  /* stack top before pushing call arguments */
  Capture *open = findopen(close);  /* get open group capture */
  assert(captype(open) == Cgroup);
  id = finddyncap(open, close);  /* get first dynamic capture argument */
  close->kind = Cclose;  /* closes the group */
  close->index = s - cs->s;
  cs->cap = open; cs->valuecached = 0;  /* prepare capture state */
  luaL_checkstack(L, 4, "too many runtime captures");
  pushluaval(cs);  /* push function to be called */
  lua_pushvalue(L, SUBJIDX);  /* push original subject */
  lua_pushinteger(L, s - cs->s + 1);  /* push current position */
  n = pushnestedvalues(cs, 0);  /* push nested captures */
  lua_call(L, n + 2, LUA_MULTRET);  /* call dynamic function */
  if (id > 0) {  /* are there old dynamic captures to be removed? */
    int i;
    /* each lua_remove shifts the elements above down by one, so removing
       at the same index 'id' repeatedly drops slots id..otop */
    for (i = id; i <= otop; i++)
      lua_remove(L, id);  /* remove old dynamic captures */
    *rem = otop - id + 1;  /* total number of dynamic captures removed */
  }
  else
    *rem = 0;  /* no dynamic captures removed */
  return close - open - 1;  /* number of captures to be removed */
}


/*
** Auxiliary structure for substitution and string captures: keep
** information about nested captures for future use, avoiding to push
** string results into Lua
*/
typedef struct StrAux {
  int isstring;  /* whether capture is a string */
  union {
    Capture *cp;  /* if not a string, respective capture */
    struct {  /* if it is a string... */
      Index_t idx;  /* starts here */
      Index_t siz;  /* with this size */
    } s;
  } u;
} StrAux;

/* size of the 'cps' array used by string captures (entries for %0-%9) */
#define MAXSTRCAPS	10

/*
** Collect values from current capture into array 'cps'. Current
** capture must be Cstring (first call) or Csimple (recursive calls).
** (In first call, fills %0 with whole match for Cstring.)
** Returns number of elements in the array that were filled.
** 'n' is the first free slot of 'cps' on entry.
*/
static int getstrcaps (CapState *cs, StrAux *cps, int n) {
  int k = n++;  /* slot reserved for this capture's own match */
  Capture *head = cs->cap++;
  cps[k].isstring = 1;  /* get string value */
  cps[k].u.s.idx = head->index;  /* starts here */
  while (capinside(head, cs->cap)) {
    if (n >= MAXSTRCAPS)  /* too many captures? */
      nextcap(cs);  /* skip extra captures (will not need them) */
    else if (captype(cs->cap) == Csimple)  /* string? */
      n = getstrcaps(cs, cps, n);  /* put info. into array */
    else {
      cps[n].isstring = 0;  /* not a string */
      cps[n].u.cp = cs->cap;  /* keep original capture */
      nextcap(cs);
      n++;
    }
  }
  /* size is filled last: 'closesize' needs 'cs->cap' at the end of 'head' */
  cps[k].u.s.siz = closesize(cs, head);
  skipclose(cs, head);
  return n;
}


/*
** add next capture value (which should be a string) to buffer 'b'
*/
static int addonestring (luaL_Buffer *b, CapState *cs, const char *what);


/*
** String capture: add result to buffer 'b' (instead of pushing
** it into the stack)
** The format string comes from the ktable; '%' followed by a digit is
** replaced by the corresponding nested capture, '%' followed by any
** other character adds that character literally.
*/
static void stringcap (luaL_Buffer *b, CapState *cs) {
  StrAux cps[MAXSTRCAPS];
  int n;
  size_t len, i;
  const char *fmt;  /* format string */
  fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len);
  n = getstrcaps(cs, cps, 0) - 1;  /* collect nested captures */
  for (i = 0; i < len; i++) {  /* traverse format string */
    if (fmt[i] != '%')  /* not an escape? */
      luaL_addchar(b, fmt[i]);  /* add it to buffer */
    /* NOTE(review): when '%' is the last character, fmt[++i] reads the
       byte at index 'len' and the branch below adds it to the buffer;
       looks like a trailing '%' appends the string terminator --
       confirm whether that is intended */
    else if (fmt[++i] < '0' || fmt[i] > '9')  /* not followed by a digit? */
      luaL_addchar(b, fmt[i]);  /* add to buffer */
    else {
      int l = fmt[i] - '0';  /* capture index */
      if (l > n)
        luaL_error(cs->L, "invalid capture index (%d)", l);
      else if (cps[l].isstring)
        luaL_addlstring(b, cs->s + cps[l].u.s.idx, cps[l].u.s.siz);
      else {
        Capture *curr = cs->cap;
        cs->cap = cps[l].u.cp;  /* go back to evaluate that nested capture */
        if (!addonestring(b, cs, "capture"))
          luaL_error(cs->L, "no values in capture index %d", l);
        cs->cap = curr;  /* continue from where it stopped */
      }
    }
  }
}


/*
** Substitution capture: add result to buffer 'b'
** Copies the matched text, replacing the span of each nested capture
** that produces a value by that value.
*/
static void substcap (luaL_Buffer *b, CapState *cs) {
  const char *curr = cs->s + cs->cap->index;  /* next subject text to copy */
  Capture *head = cs->cap++;
  while (capinside(head, cs->cap)) {
    Capture *cap = cs->cap;
    const char *caps = cs->s + cap->index;  /* start of nested match */
    luaL_addlstring(b, curr, caps - curr);  /* add text up to capture */
    if (addonestring(b, cs, "replacement"))
      curr = caps + capsize(cap, cs->cap - 1);  /* continue after match */
    else  /* no capture value */
      curr = caps;  /* keep original text in final result */
  }
  /* add last piece of text */
  luaL_addlstring(b, curr, cs->s + head->index + closesize(cs, head) - curr);
  skipclose(cs, head);
}


/*
** Evaluates a capture and adds its first value to buffer 'b'; returns
** whether there was a value
** 'what' names the role of the value in the error message raised when
** the value is not a string (or number, as accepted by lua_isstring).
*/
static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) {
  switch (captype(cs->cap)) {
    case Cstring:
      stringcap(b, cs);  /* add capture directly to buffer */
      return 1;
    case Csubst:
      substcap(b, cs);  /* add capture directly to buffer */
      return 1;
    case Cacc:  /* accumulator capture? */
      return luaL_error(cs->L, "invalid context for an accumulator capture");
    default: {
      lua_State *L = cs->L;
      int n = pushcapture(cs);
      if (n > 0) {
        if (n > 1) lua_pop(L, n - 1);  /* only one result */
        if (!lua_isstring(L, -1))
          return luaL_error(L, "invalid %s value (a %s)",
                               what, luaL_typename(L, -1));
        luaL_addvalue(b);  /* pops the value and appends it to 'b' */
      }
      return n;
    }
  }
}


/* limit checked against 'cs->reclevel' in 'pushcapture' */
#if !defined(MAXRECLEVEL)
#define MAXRECLEVEL	200
#endif


/*
** Push all values of the current capture into the stack; returns
** number of values pushed
*/
static int pushcapture (CapState *cs) {
  lua_State *L = cs->L;
  int res;
  luaL_checkstack(L, 4, "too many captures");
  if (cs->reclevel++ > MAXRECLEVEL)
    return luaL_error(L, "subcapture nesting too deep");
  switch (captype(cs->cap)) {
    case Cposition: {
      lua_pushinteger(L, cs->cap->index + 1);
      cs->cap++;
      res = 1;
      break;
    }
    case Cconst: {
      pushluaval(cs);
      cs->cap++;
      res = 1;
      break;
    }
    case Carg: {
      int arg = (cs->cap++)->idx;
      if (arg + FIXEDARGS > cs->ptop)
        return luaL_error(L, "reference to absent extra argument #%d", arg);
      lua_pushvalue(L, arg + FIXEDARGS);
      res = 1;
      break;
    }
    case Csimple: {
      int k = pushnestedvalues(cs, 1);
      lua_insert(L, -k);  /* make whole match be first result */
      res = k;
      break;
    }
    case Cruntime: {
      lua_pushvalue(L, (cs->cap++)->idx);  /* value is in the stack */
      res = 1;
      break;
    }
    case Cstring: {
      luaL_Buffer b;
      luaL_buffinit(L, &b);
      stringcap(&b, cs);
      luaL_pushresult(&b);
      res = 1;
      break;
    }
    case Csubst: {
      luaL_Buffer b;
      luaL_buffinit(L, &b);
      substcap(&b, cs);
      luaL_pushresult(&b);
      res = 1;
break; } case Cgroup: { if (cs->cap->idx == 0) /* anonymous group? */ res = pushnestedvalues(cs, 0); /* add all nested values */ else { /* named group: add no values */ nextcap(cs); /* skip capture */ res = 0; } break; } case Cbackref: res = backrefcap(cs); break; case Ctable: res = tablecap(cs); break; case Cfunction: res = functioncap(cs); break; case Cacc: res = accumulatorcap(cs); break; case Cnum: res = numcap(cs); break; case Cquery: res = querycap(cs); break; case Cfold: res = foldcap(cs); break; default: assert(0); res = 0; } cs->reclevel--; return res; } /* ** Prepare a CapState structure and traverse the entire list of ** captures in the stack pushing its results. 's' is the subject ** string, 'r' is the final position of the match, and 'ptop' ** the index in the stack where some useful values were pushed. ** Returns the number of results pushed. (If the list produces no ** results, push the final position of the match.) */ int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop)); int n = 0; /* printcaplist(capture); */ if (!isclosecap(capture)) { /* is there any capture? */ CapState cs; cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0; cs.s = s; cs.valuecached = 0; cs.ptop = ptop; cs.firstcap = lua_gettop(L) + 1; /* where first value (if any) will go */ do { /* collect their values */ n += pushcapture(&cs); } while (!isclosecap(cs.cap)); assert(lua_gettop(L) - cs.firstcap == n - 1); } if (n == 0) { /* no capture values? 
*/ lua_pushinteger(L, r - s + 1); /* return only end position */ n = 1; } return n; } lpeg-1.1.0/lpcset.c0000664000175000017500000000632014446336477014035 0ustar robertoroberto #include "lptypes.h" #include "lpcset.h" /* ** Add to 'c' the index of the (only) bit set in byte 'b' */ static int onlybit (int c, int b) { if ((b & 0xF0) != 0) { c += 4; b >>= 4; } if ((b & 0x0C) != 0) { c += 2; b >>= 2; } if ((b & 0x02) != 0) { c += 1; } return c; } /* ** Check whether a charset is empty (returns IFail), singleton (IChar), ** full (IAny), or none of those (ISet). When singleton, 'info.offset' ** returns which character it is. When generic set, 'info' returns ** information about its range. */ Opcode charsettype (const byte *cs, charsetinfo *info) { int low0, low1, high0, high1; for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) /* find lowest byte with a 1-bit */; if (low1 == CHARSETSIZE) return IFail; /* no characters in set */ for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) /* find highest byte with a 1-bit; low1 is a sentinel */; if (low1 == high1) { /* only one byte with 1-bits? */ int b = cs[low1]; if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ info->offset = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ return IChar; /* single character */ } } for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) /* find lowest byte with a 0-bit */; if (low0 == CHARSETSIZE) return IAny; /* set has all bits set */ for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) /* find highest byte with a 0-bit; low0 is a sentinel */; if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ info->offset = low1; info->size = high1 - low1 + 1; info->deflt = 0; /* all discharged bits were 0 */ } else { info->offset = low0; info->size = high0 - low0 + 1; info->deflt = 0xFF; /* all discharged bits were 1 */ } info->cs = cs + info->offset; return ISet; } /* ** Get a byte from a compact charset. 
If index is inside the charset ** range, get the byte from the supporting charset (correcting it ** by the offset). Otherwise, return the default for the set. */ byte getbytefromcharset (const charsetinfo *info, int index) { if (index < info->size) return info->cs[index]; else return info->deflt; } /* ** If 'tree' is a 'char' pattern (TSet, TChar, TAny, TFalse), convert it ** into a charset and return 1; else return 0. */ int tocharset (TTree *tree, Charset *cs) { switch (tree->tag) { case TChar: { /* only one char */ assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); clearset(cs->cs); /* erase all chars */ setchar(cs->cs, tree->u.n); /* add that one */ return 1; } case TAny: { fillset(cs->cs, 0xFF); /* add all characters to the set */ return 1; } case TFalse: { clearset(cs->cs); /* empty set */ return 1; } case TSet: { /* fill set */ int i; fillset(cs->cs, tree->u.set.deflt); for (i = 0; i < tree->u.set.size; i++) cs->cs[tree->u.set.offset + i] = treebuffer(tree)[i]; return 1; } default: return 0; } } void tree2cset (TTree *tree, charsetinfo *info) { assert(tree->tag == TSet); info->offset = tree->u.set.offset; info->size = tree->u.set.size; info->deflt = tree->u.set.deflt; info->cs = treebuffer(tree); } lpeg-1.1.0/lpprint.h0000664000175000017500000000120114446336477014231 0ustar robertoroberto #if !defined(lpprint_h) #define lpprint_h #include "lptree.h" #include "lpvm.h" #if defined(LPEG_DEBUG) void printpatt (Instruction *p); void printtree (TTree *tree, int ident); void printktable (lua_State *L, int idx); void printcharset (const byte *st); void printcaplist (Capture *cap); void printinst (const Instruction *op, const Instruction *p); #else #define printktable(L,idx) \ luaL_error(L, "function only implemented in debug mode") #define printtree(tree,i) \ luaL_error(L, "function only implemented in debug mode") #define printpatt(p) \ luaL_error(L, "function only implemented in debug mode") #endif #endif 
lpeg-1.1.0/re.lua0000664000175000017500000001464214446336477013516 0ustar robertoroberto-- -- Copyright 2007-2023, Lua.org & PUC-Rio (see 'lpeg.html' for license) -- written by Roberto Ierusalimschy -- -- imported functions and modules local tonumber, type, print, error = tonumber, type, print, error local setmetatable = setmetatable local m = require"lpeg" -- 'm' will be used to parse expressions, and 'mm' will be used to -- create expressions; that is, 're' runs on 'm', creating patterns -- on 'mm' local mm = m -- patterns' metatable local mt = getmetatable(mm.P(0)) local version = _VERSION -- No more global accesses after this point _ENV = nil -- does no harm in Lua 5.1 local any = m.P(1) -- Pre-defined names local Predef = { nl = m.P"\n" } local mem local fmem local gmem local function updatelocale () mm.locale(Predef) Predef.a = Predef.alpha Predef.c = Predef.cntrl Predef.d = Predef.digit Predef.g = Predef.graph Predef.l = Predef.lower Predef.p = Predef.punct Predef.s = Predef.space Predef.u = Predef.upper Predef.w = Predef.alnum Predef.x = Predef.xdigit Predef.A = any - Predef.a Predef.C = any - Predef.c Predef.D = any - Predef.d Predef.G = any - Predef.g Predef.L = any - Predef.l Predef.P = any - Predef.p Predef.S = any - Predef.s Predef.U = any - Predef.u Predef.W = any - Predef.w Predef.X = any - Predef.x mem = {} -- restart memoization fmem = {} gmem = {} local mt = {__mode = "v"} setmetatable(mem, mt) setmetatable(fmem, mt) setmetatable(gmem, mt) end updatelocale() local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) local function patt_error (s, i) local msg = (#s < i + 20) and s:sub(i) or s:sub(i,i+20) .. "..." 
msg = ("pattern error near '%s'"):format(msg) error(msg, 2) end local function mult (p, n) local np = mm.P(true) while n >= 1 do if n%2 >= 1 then np = np * p end p = p * p n = n/2 end return np end local function equalcap (s, i, c) if type(c) ~= "string" then return nil end local e = #c + i if s:sub(i, e - 1) == c then return e else return nil end end local S = (Predef.space + "--" * (any - Predef.nl)^0)^0 local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0 local arrow = S * "<-" local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1 name = m.C(name) -- a defined name only have meaning in a given environment local Def = name * m.Carg(1) local function getdef (id, defs) local c = defs and defs[id] if not c then error("undefined name: " .. id) end return c end -- match a name and return a group of its corresponding definition -- and 'f' (to be folded in 'Suffix') local function defwithfunc (f) return m.Cg(Def / getdef * m.Cc(f)) end local num = m.C(m.R"09"^1) * S / tonumber local String = "'" * m.C((any - "'")^0) * "'" + '"' * m.C((any - '"')^0) * '"' local defined = "%" * Def / function (c,Defs) local cat = Defs and Defs[c] or Predef[c] if not cat then error ("name '" .. c .. 
"' undefined") end return cat end local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R local item = (defined + Range + m.C(any)) / m.P local Class = "[" * (m.C(m.P"^"^-1)) -- optional complement symbol * (item * ((item % mt.__add) - "]")^0) / function (c, p) return c == "^" and any - p or p end * "]" local function adddef (t, k, exp) if t[k] then error("'"..k.."' already defined as a rule") else t[k] = exp end return t end local function firstdef (n, r) return adddef({n}, n, r) end local function NT (n, b) if not b then error("rule '"..n.."' used outside a grammar") else return mm.V(n) end end local exp = m.P{ "Exp", Exp = S * ( m.V"Grammar" + m.V"Seq" * ("/" * S * m.V"Seq" % mt.__add)^0 ); Seq = (m.Cc(m.P"") * (m.V"Prefix" % mt.__mul)^0) * (#seq_follow + patt_error); Prefix = "&" * S * m.V"Prefix" / mt.__len + "!" * S * m.V"Prefix" / mt.__unm + m.V"Suffix"; Suffix = m.V"Primary" * S * ( ( m.P"+" * m.Cc(1, mt.__pow) + m.P"*" * m.Cc(0, mt.__pow) + m.P"?" * m.Cc(-1, mt.__pow) + "^" * ( m.Cg(num * m.Cc(mult)) + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow)) ) + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) + m.P"{}" * m.Cc(nil, m.Ct) + defwithfunc(mt.__div) ) + "=>" * S * defwithfunc(mm.Cmt) + ">>" * S * defwithfunc(mt.__mod) + "~>" * S * defwithfunc(mm.Cf) ) % function (a,b,f) return f(a,b) end * S )^0; Primary = "(" * m.V"Exp" * ")" + String / mm.P + Class + defined + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" / function (n, p) return mm.Cg(p, n) end + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end + m.P"{}" / mm.Cp + "{~" * m.V"Exp" * "~}" / mm.Cs + "{|" * m.V"Exp" * "|}" / mm.Ct + "{" * m.V"Exp" * "}" / mm.C + m.P"." 
* m.Cc(any) + (name * -arrow + "<" * name * ">") * m.Cb("G") / NT; Definition = name * arrow * m.V"Exp"; Grammar = m.Cg(m.Cc(true), "G") * ((m.V"Definition" / firstdef) * (m.V"Definition" % adddef)^0) / mm.P } local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error) local function compile (p, defs) if mm.type(p) == "pattern" then return p end -- already compiled local cp = pattern:match(p, 1, defs) if not cp then error("incorrect pattern", 3) end return cp end local function match (s, p, i) local cp = mem[p] if not cp then cp = compile(p) mem[p] = cp end return cp:match(s, i or 1) end local function find (s, p, i) local cp = fmem[p] if not cp then cp = compile(p) / 0 cp = mm.P{ mm.Cp() * cp * mm.Cp() + 1 * mm.V(1) } fmem[p] = cp end local i, e = cp:match(s, i or 1) if i then return i, e - 1 else return i end end local function gsub (s, p, rep) local g = gmem[p] or {} -- ensure gmem[p] is not collected while here gmem[p] = g local cp = g[rep] if not cp then cp = compile(p) cp = mm.Cs((cp / rep + 1)^0) g[rep] = cp end return cp:match(s) end -- exported names local re = { compile = compile, match = match, find = find, gsub = gsub, updatelocale = updatelocale, } if version == "Lua 5.1" then _G.re = re end return re lpeg-1.1.0/lpvm.h0000664000175000017500000000445014446336477013530 0ustar robertoroberto #if !defined(lpvm_h) #define lpvm_h #include "lpcap.h" /* ** About Character sets in instructions: a set is a bit map with an ** initial offset, in bits, and a size, in number of instructions. ** aux1 has the default value for the bits outsize that range. 
*/


/*
** Virtual Machine's instructions. In the comments below, 'aux1', 'key',
** and 'offset' name fields of the 'Instruction' union.
*/
typedef enum Opcode {
  IAny, /* if no char, fail */
  IChar, /* if char != aux1, fail */
  ISet, /* if char not in set, fail */
  ITestAny, /* if no char, jump to 'offset' */
  ITestChar, /* if char != aux1, jump to 'offset' */
  ITestSet, /* if char not in set, jump to 'offset' */
  ISpan, /* read a span of chars in set */
  IUTFR, /* if codepoint not in range [offset, utf_to], fail */
  IBehind, /* walk back 'aux1' characters (fail if not possible) */
  IRet, /* return from a rule */
  IEnd, /* end of pattern */
  IChoice, /* stack a choice; next fail will jump to 'offset' */
  IJmp, /* jump to 'offset' */
  ICall, /* call rule at 'offset' */
  IOpenCall, /* call rule number 'key' (must be closed to an ICall) */
  ICommit, /* pop choice and jump to 'offset' */
  IPartialCommit, /* update top choice to current position and jump */
  IBackCommit, /* backtrack like "fail" but jump to its own 'offset' */
  IFailTwice, /* pop one choice and then fail */
  IFail, /* go back to saved state on choice and jump to saved offset */
  IGiveup, /* internal use */
  IFullCapture, /* complete capture of last 'off' chars */
  IOpenCapture, /* start a capture */
  ICloseCapture, /* NOTE(review): presumably ends a capture started by IOpenCapture; confirm in the VM */
  ICloseRunTime, /* NOTE(review): presumably ends a run-time capture; confirm in the VM */
  IEmpty /* to fill empty slots left by optimizations */
} Opcode;


/*
** Every array of instructions has a 'codesize' as its first element
** and is referred to by a pointer to its second element, which is the
** first actual opcode.
*/ typedef union Instruction { struct Inst { byte code; byte aux1; union { short key; struct { byte offset; byte size; } set; } aux2; } i; int offset; uint codesize; byte buff[1]; } Instruction; /* extract 24-bit value from an instruction */ #define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1) int charinset (const Instruction *i, const byte *buff, uint c); const char *match (lua_State *L, const char *o, const char *s, const char *e, Instruction *op, Capture *capture, int ptop); #endif lpeg-1.1.0/makefile0000664000175000017500000000244214446336477014100 0ustar robertorobertoLIBNAME = lpeg LUADIR = ./lua/ COPT = -O2 -DNDEBUG # COPT = -O0 -DLPEG_DEBUG -g CWARNS = -Wall -Wextra -pedantic \ -Waggregate-return \ -Wcast-align \ -Wcast-qual \ -Wdisabled-optimization \ -Wpointer-arith \ -Wshadow \ -Wredundant-decls \ -Wsign-compare \ -Wundef \ -Wwrite-strings \ -Wbad-function-cast \ -Wdeclaration-after-statement \ -Wmissing-prototypes \ -Wmissing-declarations \ -Wnested-externs \ -Wstrict-prototypes \ -Wc++-compat \ # -Wunreachable-code \ CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC CC = gcc FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o lpcset.o # For Linux linux: $(MAKE) lpeg.so "DLLFLAGS = -shared -fPIC" # For Mac OS macosx: $(MAKE) lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup" lpeg.so: $(FILES) env $(CC) $(DLLFLAGS) $(FILES) -o lpeg.so $(FILES): makefile test: test.lua re.lua lpeg.so ./test.lua clean: rm -f $(FILES) lpeg.so lpcap.o: lpcap.c lpcap.h lptypes.h lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h lpcset.h lpcset.o: lpcset.c lptypes.h lpcset.h lpcode.h lptree.h lpvm.h lpcap.h lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h lpcode.h lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h \ lpcset.h lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h lpeg-1.1.0/lpcode.h0000664000175000017500000000124314446336477014015 0ustar robertoroberto #if !defined(lpcode_h) #define 
lpcode_h #include "lua.h" #include "lptypes.h" #include "lptree.h" #include "lpvm.h" int checkaux (TTree *tree, int pred); int fixedlen (TTree *tree); int hascaptures (TTree *tree); int lp_gc (lua_State *L); Instruction *compile (lua_State *L, Pattern *p, uint size); void freecode (lua_State *L, Pattern *p); int sizei (const Instruction *i); #define PEnullable 0 #define PEnofail 1 /* ** nofail(t) implies that 't' cannot fail with any input */ #define nofail(t) checkaux(t, PEnofail) /* ** (not nullable(t)) implies 't' cannot match without consuming ** something */ #define nullable(t) checkaux(t, PEnullable) #endif lpeg-1.1.0/lpcode.c0000664000175000017500000007756414446336477014033 0ustar robertoroberto #include #include "lua.h" #include "lauxlib.h" #include "lptypes.h" #include "lpcode.h" #include "lpcset.h" /* signals a "no-instruction */ #define NOINST -1 static const Charset fullset_ = {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; static const Charset *fullset = &fullset_; /* ** {====================================================== ** Analysis and some optimizations ** ======================================================= */ /* ** A few basic operations on Charsets */ static void cs_complement (Charset *cs) { loopset(i, cs->cs[i] = ~cs->cs[i]); } static int cs_disjoint (const Charset *cs1, const Charset *cs2) { loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) return 1; } /* ** Visit a TCall node taking care to stop recursion. If node not yet ** visited, return 'f(sib2(tree))', otherwise return 'def' (default ** value) */ static int callrecursive (TTree *tree, int f (TTree *t), int def) { int key = tree->key; assert(tree->tag == TCall); assert(sib2(tree)->tag == TRule); if (key == 0) /* node already visited? 
*/ return def; /* return default value */ else { /* first visit */ int result; tree->key = 0; /* mark call as already visited */ result = f(sib2(tree)); /* go to called rule */ tree->key = key; /* restore tree */ return result; } } /* ** Check whether a pattern tree has captures */ int hascaptures (TTree *tree) { tailcall: switch (tree->tag) { case TCapture: case TRunTime: return 1; case TCall: return callrecursive(tree, hascaptures, 0); case TRule: /* do not follow siblings */ tree = sib1(tree); goto tailcall; case TOpenCall: assert(0); default: { switch (numsiblings[tree->tag]) { case 1: /* return hascaptures(sib1(tree)); */ tree = sib1(tree); goto tailcall; case 2: if (hascaptures(sib1(tree))) return 1; /* else return hascaptures(sib2(tree)); */ tree = sib2(tree); goto tailcall; default: assert(numsiblings[tree->tag] == 0); return 0; } } } } /* ** Checks how a pattern behaves regarding the empty string, ** in one of two different ways: ** A pattern is *nullable* if it can match without consuming any character; ** A pattern is *nofail* if it never fails for any string ** (including the empty string). ** The difference is only for predicates and run-time captures; ** for other patterns, the two properties are equivalent. ** (With predicates, &'a' is nullable but not nofail. Of course, ** nofail => nullable.) ** These functions are all convervative in the following way: ** p is nullable => nullable(p) ** nofail(p) => p cannot fail ** The function assumes that TOpenCall is not nullable; ** this will be checked again when the grammar is fixed. ** Run-time captures can do whatever they want, so the result ** is conservative. 
*/
/*
** 'pred' selects the property to check (PEnullable or PEnofail); the
** result is 1 when the check answers "yes" for 'tree' and 0 otherwise.
** Cases that end in 'goto tailcall' are tail calls on a child node
** written as a jump; the equivalent recursive call is shown in the
** comment next to each of them.
*/
int checkaux (TTree *tree, int pred) {
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny: case TUTFR:
    case TFalse: case TOpenCall:
      return 0;  /* not nullable */
    case TRep: case TTrue:
      return 1;  /* no fail */
    case TNot: case TBehind:  /* can match empty, but can fail */
      if (pred == PEnofail) return 0;
      else return 1;  /* PEnullable */
    case TAnd:  /* can match empty; fail iff body does */
      if (pred == PEnullable) return 1;
      /* else return checkaux(sib1(tree), pred); */
      tree = sib1(tree); goto tailcall;
    case TRunTime:  /* can fail; match empty iff body does */
      if (pred == PEnofail) return 0;
      /* else return checkaux(sib1(tree), pred); */
      tree = sib1(tree); goto tailcall;
    case TSeq:  /* both parts must have the property */
      if (!checkaux(sib1(tree), pred)) return 0;
      /* else return checkaux(sib2(tree), pred); */
      tree = sib2(tree); goto tailcall;
    case TChoice:  /* one alternative with the property is enough */
      if (checkaux(sib2(tree), pred)) return 1;
      /* else return checkaux(sib1(tree), pred); */
      tree = sib1(tree); goto tailcall;
    case TCapture: case TGrammar: case TRule: case TXInfo:
      /* return checkaux(sib1(tree), pred); */
      tree = sib1(tree); goto tailcall;
    case TCall:
      /* return checkaux(sib2(tree), pred); */
      tree = sib2(tree); goto tailcall;
    default: assert(0); return 0;
  }
}


/*
** number of characters to match a pattern (or -1 if variable)
**
** 'len' accumulates the length of the parts already traversed, so that
** the second half of a sequence can be handled by a jump to 'tailcall'
** instead of a recursive call. Calls go through 'callrecursive' with a
** default of -1, so a rule reached again while it is still being
** visited counts as variable length.
*/
int fixedlen (TTree *tree) {
  int len = 0;  /* to accumulate in tail calls */
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny:
      return len + 1;
    case TUTFR:
      /* fixed only when both nodes carry the same 'cap' value
         (NOTE(review): presumably the encoded length of each end of
         the range; confirm where TUTFR nodes are built) */
      return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1;
    case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
      return len;
    case TRep: case TRunTime: case TOpenCall:
      return -1;
    case TCapture: case TRule: case TGrammar: case TXInfo:
      /* return fixedlen(sib1(tree)); */
      tree = sib1(tree); goto tailcall;
    case TCall: {
      int n1 = callrecursive(tree, fixedlen, -1);
      if (n1 < 0)
        return -1;
      else
        return len + n1;
    }
    case TSeq: {
      int n1 = fixedlen(sib1(tree));
      if (n1 < 0)
        return -1;
      /* else return fixedlen(sib2(tree)) + len; */
      len += n1; tree = sib2(tree); goto tailcall;
    }
    case TChoice: {  /* fixed only if both alternatives agree */
      int n1 = fixedlen(sib1(tree));
      int n2 = fixedlen(sib2(tree));
      if (n1 != n2 || n1 < 0)
        return -1;
      else
        return len + n1;
    }
    default: assert(0); return 0;
  };
}


/*
** Computes the 'first set' of a pattern.
** The result is a conservative approximation:
**   match p ax -> x (for some x) ==> a belongs to first(p)
** or
**   a not in first(p) ==> match p ax -> fail (for all x)
**
** The set 'follow' is the first set of what follows the
** pattern (full set if nothing follows it).
**
** The function returns 0 when this resulting set can be used for
** test instructions that avoid the pattern altogether.
** A non-zero return can happen for two reasons:
** 1) match p '' -> ''            ==> return has bit 1 set
** (tests cannot be used because they would always fail for an empty input);
** 2) there is a match-time capture ==> return has bit 2 set
** (optimizations should not bypass match-time captures).
*/
/*
** 'tree' is the pattern, 'follow' the first set of whatever comes after
** it, and 'firstset' receives the result. Cases that end in
** 'goto tailcall' are tail calls on a child node written as a jump.
*/
static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny: case TFalse: {
      tocharset(tree, firstset);
      return 0;
    }
    case TUTFR: {
      int c;
      clearset(firstset->cs);  /* erase all chars */
      for (c = tree->key; c <= sib1(tree)->key; c++)
        setchar(firstset->cs, c);
      return 0;
    }
    case TTrue: {
      loopset(i, firstset->cs[i] = follow->cs[i]);
      return 1;  /* accepts the empty string */
    }
    case TChoice: {
      Charset csaux;
      int e1 = getfirst(sib1(tree), follow, firstset);
      int e2 = getfirst(sib2(tree), follow, &csaux);
      loopset(i, firstset->cs[i] |= csaux.cs[i]);
      return e1 | e2;
    }
    case TSeq: {
      if (!nullable(sib1(tree))) {
        /* when p1 is not nullable, p2 has nothing to contribute;
           return getfirst(sib1(tree), fullset, firstset); */
        tree = sib1(tree); follow = fullset; goto tailcall;
      }
      else {  /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */
        Charset csaux;
        int e2 = getfirst(sib2(tree), follow, &csaux);
        int e1 = getfirst(sib1(tree), &csaux, firstset);
        if (e1 == 0) return 0;  /* 'e1' ensures that first can be used */
        else if ((e1 | e2) & 2)  /* one of the children has a matchtime? */
          return 2;  /* pattern has a matchtime capture */
        else return e2;  /* else depends on 'e2' */
      }
    }
    case TRep: {
      getfirst(sib1(tree), follow, firstset);
      loopset(i, firstset->cs[i] |= follow->cs[i]);
      return 1;  /* accept the empty string */
    }
    case TCapture: case TGrammar: case TRule: case TXInfo: {
      /* return getfirst(sib1(tree), follow, firstset); */
      tree = sib1(tree); goto tailcall;
    }
    case TRunTime: {  /* function invalidates any follow info. */
      int e = getfirst(sib1(tree), fullset, firstset);
      if (e) return 2;  /* function is not "protected"? */
      else return 0;  /* pattern inside capture ensures first can be used */
    }
    case TCall: {
      /* return getfirst(sib2(tree), follow, firstset); */
      tree = sib2(tree); goto tailcall;
    }
    case TAnd: {
      int e = getfirst(sib1(tree), follow, firstset);
      loopset(i, firstset->cs[i] &= follow->cs[i]);
      return e;
    }
    case TNot: {
      if (tocharset(sib1(tree), firstset)) {
        cs_complement(firstset);
        return 1;
      }
      /* else */
    }
    /* FALLTHROUGH */
    case TBehind: {  /* instruction gives no new information */
      /* call 'getfirst' only to check for match-time captures */
      int e = getfirst(sib1(tree), follow, firstset);
      loopset(i, firstset->cs[i] = follow->cs[i]);  /* uses follow */
      return e | 1;  /* always can accept the empty string */
    }
    default: assert(0); return 0;
  }
}


/*
** If 'headfail(tree)' true, then 'tree' can fail only depending on the
** next character of the subject.
** (Returns 1 for "yes" and 0 for "no"; tail calls on children are
** written as jumps to 'tailcall'.)
*/
static int headfail (TTree *tree) {
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny: case TFalse:
      return 1;
    case TTrue: case TRep: case TRunTime: case TNot:
    case TBehind: case TUTFR:
      return 0;
    case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd:
      tree = sib1(tree); goto tailcall;  /* return headfail(sib1(tree)); */
    case TCall:
      tree = sib2(tree); goto tailcall;  /* return headfail(sib2(tree)); */
    case TSeq:  /* what follows the head must be unable to fail */
      if (!nofail(sib2(tree))) return 0;
      /* else return headfail(sib1(tree)); */
      tree = sib1(tree); goto tailcall;
    case TChoice:  /* both alternatives must be head-fail */
      if (!headfail(sib1(tree))) return 0;
      /* else return headfail(sib2(tree)); */
      tree = sib2(tree); goto tailcall;
    default: assert(0); return 0;
  }
}


/*
** Check whether the code generation for the given tree can benefit
** from a follow set (to avoid computing the follow set when it is
** not needed)
** (Returns 1 when it can, 0 otherwise. Captures defer to their body
** and sequences to their second element.)
*/
static int needfollow (TTree *tree) {
 tailcall:
  switch (tree->tag) {
    case TChar: case TSet: case TAny: case TUTFR:
    case TFalse: case TTrue: case TAnd: case TNot:
    case TRunTime: case TGrammar: case TCall: case TBehind:
      return 0;
    case TChoice: case TRep:
      return 1;
    case TCapture:
      tree = sib1(tree); goto tailcall;
    case TSeq:
      tree = sib2(tree); goto tailcall;
    default: assert(0); return 0;
  }
}

/* }====================================================== */



/*
** {======================================================
** Code generation
** =======================================================
*/


/*
** size of an instruction
** (counted in 'Instruction' slots: set instructions take one slot plus
** their charset, test-set instructions take two slots plus their
** charset, instructions followed by an offset take two slots, and
** everything else takes one)
*/
int sizei (const Instruction *i) {
  switch((Opcode)i->i.code) {
    case ISet: case ISpan: return 1 + i->i.aux2.set.size;
    case ITestSet: return 2 + i->i.aux2.set.size;
    case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
    case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
    case IUTFR:
      return 2;
    default: return 1;
  }
}


/*
** state for the compiler
*/
typedef struct CompileState {
  Pattern *p;  /* pattern being compiled */
  int ncode;  /* next position in p->code to be filled */
  lua_State *L;  /* Lua state, used for the allocator and for errors */
} CompileState;


/*
** code generation is recursive; 'opt' indicates that the code is being
** generated as the last thing inside an optional pattern (so, if that
** code is optional too, it can reuse the 'IChoice' already in place for
** the outer pattern). 'tt' points to a previous test protecting this
** code (or NOINST). 'fl' is the follow set of the pattern.
*/ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, const Charset *fl); static void finishrelcode (lua_State *L, Pattern *p, Instruction *block, int size) { if (block == NULL) luaL_error(L, "not enough memory"); block->codesize = size; p->code = (Instruction *)block + 1; } /* ** Initialize array 'p->code' */ static void newcode (lua_State *L, Pattern *p, int size) { void *ud; Instruction *block; lua_Alloc f = lua_getallocf(L, &ud); size++; /* slot for 'codesize' */ block = (Instruction*) f(ud, NULL, 0, size * sizeof(Instruction)); finishrelcode(L, p, block, size); } void freecode (lua_State *L, Pattern *p) { if (p->code != NULL) { void *ud; lua_Alloc f = lua_getallocf(L, &ud); uint osize = p->code[-1].codesize; f(ud, p->code - 1, osize * sizeof(Instruction), 0); /* free block */ } } /* ** Assume that 'nsize' is not zero and that 'p->code' already exists. */ static void realloccode (lua_State *L, Pattern *p, int nsize) { void *ud; lua_Alloc f = lua_getallocf(L, &ud); Instruction *block = p->code - 1; uint osize = block->codesize; nsize++; /* add the 'codesize' slot to size */ block = (Instruction*) f(ud, block, osize * sizeof(Instruction), nsize * sizeof(Instruction)); finishrelcode(L, p, block, nsize); } /* ** Add space for an instruction with 'n' slots and return its index. 
*/ static int nextinstruction (CompileState *compst, int n) { int size = compst->p->code[-1].codesize - 1; int ncode = compst->ncode; if (ncode > size - n) { uint nsize = size + (size >> 1) + n; if (nsize >= INT_MAX) luaL_error(compst->L, "pattern code too large"); realloccode(compst->L, compst->p, nsize); } compst->ncode = ncode + n; return ncode; } #define getinstr(cs,i) ((cs)->p->code[i]) static int addinstruction (CompileState *compst, Opcode op, int aux) { int i = nextinstruction(compst, 1); getinstr(compst, i).i.code = op; getinstr(compst, i).i.aux1 = aux; return i; } /* ** Add an instruction followed by space for an offset (to be set later) */ static int addoffsetinst (CompileState *compst, Opcode op) { int i = addinstruction(compst, op, 0); /* instruction */ addinstruction(compst, (Opcode)0, 0); /* open space for offset */ assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); return i; } /* ** Set the offset of an instruction */ static void setoffset (CompileState *compst, int instruction, int offset) { getinstr(compst, instruction + 1).offset = offset; } static void codeutfr (CompileState *compst, TTree *tree) { int i = addoffsetinst(compst, IUTFR); int to = sib1(tree)->u.n; assert(sib1(tree)->tag == TXInfo); getinstr(compst, i + 1).offset = tree->u.n; getinstr(compst, i).i.aux1 = to & 0xff; getinstr(compst, i).i.aux2.key = to >> 8; } /* ** Add a capture instruction: ** 'op' is the capture instruction; 'cap' the capture kind; ** 'key' the key into ktable; 'aux' is the optional capture offset ** */ static int addinstcap (CompileState *compst, Opcode op, int cap, int key, int aux) { int i = addinstruction(compst, op, joinkindoff(cap, aux)); getinstr(compst, i).i.aux2.key = key; return i; } #define gethere(compst) ((compst)->ncode) #define target(code,i) ((i) + code[i + 1].offset) /* ** Patch 'instruction' to jump to 'target' */ static void jumptothere (CompileState *compst, int instruction, int target) { if (instruction >= 0) setoffset(compst, 
instruction, target - instruction); } /* ** Patch 'instruction' to jump to current position */ static void jumptohere (CompileState *compst, int instruction) { jumptothere(compst, instruction, gethere(compst)); } /* ** Code an IChar instruction, or IAny if there is an equivalent ** test dominating it */ static void codechar (CompileState *compst, int c, int tt) { if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && getinstr(compst, tt).i.aux1 == c) addinstruction(compst, IAny, 0); else addinstruction(compst, IChar, c); } /* ** Add a charset posfix to an instruction. */ static void addcharset (CompileState *compst, int inst, charsetinfo *info) { int p; Instruction *I = &getinstr(compst, inst); byte *charset; int isize = instsize(info->size); /* size in instructions */ int i; I->i.aux2.set.offset = info->offset * 8; /* offset in bits */ I->i.aux2.set.size = isize; I->i.aux1 = info->deflt; p = nextinstruction(compst, isize); /* space for charset */ charset = getinstr(compst, p).buff; /* charset buffer */ for (i = 0; i < isize * (int)sizeof(Instruction); i++) charset[i] = getbytefromcharset(info, i); /* copy the buffer */ } /* ** Check whether charset 'info' is dominated by instruction 'p' */ static int cs_equal (Instruction *p, charsetinfo *info) { if (p->i.code != ITestSet) return 0; else if (p->i.aux2.set.offset != info->offset * 8 || p->i.aux2.set.size != instsize(info->size) || p->i.aux1 != info->deflt) return 0; else { int i; for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { if ((p + 2)->buff[i] != getbytefromcharset(info, i)) return 0; } } return 1; } /* ** Code a char set, using IAny when instruction is dominated by an ** equivalent test. 
*/
static void codecharset (CompileState *compst, TTree *tree, int tt) {
  charsetinfo info;
  tree2cset(tree, &info);
  if (tt >= 0 && cs_equal(&getinstr(compst, tt), &info))
    addinstruction(compst, IAny, 0);
  else {
    int i = addinstruction(compst, ISet, 0);
    addcharset(compst, i, &info);
  }
}


/*
** Code a test set, optimizing unit sets for ITestChar, "complete"
** sets for ITestAny, and empty sets for IJmp (always fails).
** 'e' is true iff test should accept the empty string. (Test
** instructions in the current VM never accept the empty string.)
** Returns the index of the test instruction, or NOINST when no test
** was coded.
*/
static int codetestset (CompileState *compst, Charset *cs, int e) {
  if (e) return NOINST;  /* no test */
  else {
    charsetinfo info;
    Opcode op = charsettype(cs->cs, &info);
    switch (op) {
      case IFail: return addoffsetinst(compst, IJmp);  /* always jump */
      case IAny: return addoffsetinst(compst, ITestAny);
      case IChar: {
        int i = addoffsetinst(compst, ITestChar);
        getinstr(compst, i).i.aux1 = info.offset;
        return i;
      }
      default: {  /* regular set */
        int i = addoffsetinst(compst, ITestSet);
        addcharset(compst, i, &info);
        assert(op == ISet);
        return i;
      }
    }
  }
}


/*
** Find the final destination of a sequence of jumps
*/
static int finaltarget (Instruction *code, int i) {
  while (code[i].i.code == IJmp)
    i = target(code, i);
  return i;
}


/*
** final label (after traversing any jumps)
*/
static int finallabel (Instruction *code, int i) {
  return finaltarget(code, target(code, i));
}


/*
** <behind(p)> == behind n; <p>   (where n = fixedlen(p))
** (no IBehind instruction is coded when n is zero)
*/
static void codebehind (CompileState *compst, TTree *tree) {
  if (tree->u.n > 0)
    addinstruction(compst, IBehind, tree->u.n);
  codegen(compst, sib1(tree), 0, NOINST, fullset);
}


/*
** Choice; optimizations:
** - when p1 is headfail or when first(p1) and first(p2) are disjoint,
** then a character not in first(p1) cannot go to p1 and a character
** in first(p1) cannot go to p2, either because p1 will accept
** (headfail) or because it is not in first(p2) (disjoint).
** (The second case is not valid if p1 accepts the empty string,
** as then there is no character at all...)
** - when p2 is empty and opt is true; a IPartialCommit can reuse
** the Choice already active in the stack.
*/
static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
                        const Charset *fl) {
  int emptyp2 = (p2->tag == TTrue);
  Charset cs1, cs2;
  int e1 = getfirst(p1, fullset, &cs1);
  /* 'cs2' is computed (comma operator) only when it is needed */
  if (headfail(p1) ||
      (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) {
    /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */
    int test = codetestset(compst, &cs1, 0);
    int jmp = NOINST;
    codegen(compst, p1, 0, test, fl);
    if (!emptyp2)
      jmp = addoffsetinst(compst, IJmp);
    jumptohere(compst, test);
    codegen(compst, p2, opt, NOINST, fl);
    jumptohere(compst, jmp);
  }
  else if (opt && emptyp2) {
    /* p1? == IPartialCommit; p1 */
    jumptohere(compst, addoffsetinst(compst, IPartialCommit));
    codegen(compst, p1, 1, NOINST, fullset);
  }
  else {
    /* <p1 / p2> ==
        test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
    int pcommit;
    int test = codetestset(compst, &cs1, e1);
    int pchoice = addoffsetinst(compst, IChoice);
    codegen(compst, p1, emptyp2, test, fullset);
    pcommit = addoffsetinst(compst, ICommit);
    jumptohere(compst, pchoice);
    jumptohere(compst, test);
    codegen(compst, p2, opt, NOINST, fl);
    jumptohere(compst, pcommit);
  }
}


/*
** And predicate
** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
** (valid only when 'p' has no captures)
*/
static void codeand (CompileState *compst, TTree *tree, int tt) {
  int n = fixedlen(tree);
  if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) {
    codegen(compst, tree, 0, tt, fullset);
    if (n > 0)
      addinstruction(compst, IBehind, n);
  }
  else {  /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */
    int pcommit;
    int pchoice = addoffsetinst(compst, IChoice);
    codegen(compst, tree, 0, tt, fullset);
    pcommit = addoffsetinst(compst, IBackCommit);
    jumptohere(compst, pchoice);
    addinstruction(compst, IFail, 0);
    jumptohere(compst, pcommit);
  }
}


/*
** Captures: if pattern has fixed (and not too big) length, and it
** has no nested captures, use a single IFullCapture instruction
** after the match; otherwise, enclose the pattern with OpenCapture -
** CloseCapture.
*/
static void codecapture (CompileState *compst, TTree *tree, int tt,
                         const Charset *fl) {
  int len = fixedlen(sib1(tree));
  if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) {
    codegen(compst, sib1(tree), 0, tt, fl);
    addinstcap(compst, IFullCapture, tree->cap, tree->key, len);
  }
  else {
    addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0);
    codegen(compst, sib1(tree), 0, tt, fl);
    addinstcap(compst, ICloseCapture, Cclose, 0, 0);
  }
}


/*
** Run-time capture: the pattern is enclosed between an IOpenCapture
** of kind Cgroup (carrying the tree's key) and an ICloseRunTime.
*/
static void coderuntime (CompileState *compst, TTree *tree, int tt) {
  addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0);
  codegen(compst, sib1(tree), 0, tt, fullset);
  addinstcap(compst, ICloseRunTime, Cclose, 0, 0);
}


/*
** Create a jump to 'test' and fix 'test' to jump to next instruction
*/
static void closeloop (CompileState *compst, int test) {
  int jmp = addoffsetinst(compst, IJmp);
  jumptohere(compst, test);
  jumptothere(compst, jmp, test);
}


/*
** Try repetition of charsets:
** For an empty set, repetition of fail is a no-op;
** For any or char, code a tight loop;
** For generic charset, use a span instruction.
** Returns 1 if the repetition was coded here, 0 if 'tree' is not a
** charset (in which case nothing is coded).
*/
static int coderepcharset (CompileState *compst, TTree *tree) {
  switch (tree->tag) {
    case TFalse: return 1;  /* 'fail*' is a no-op */
    case TAny: {  /* L1: testany -> L2; any; jmp L1; L2: */
      int test = addoffsetinst(compst, ITestAny);
      addinstruction(compst, IAny, 0);
      closeloop(compst, test);
      return 1;
    }
    case TChar: {  /* L1: testchar c -> L2; any; jmp L1; L2: */
      int test = addoffsetinst(compst, ITestChar);
      getinstr(compst, test).i.aux1 = tree->u.n;
      addinstruction(compst, IAny, 0);
      closeloop(compst, test);
      return 1;
    }
    case TSet: {  /* regular set */
      charsetinfo info;
      int i = addinstruction(compst, ISpan, 0);
      tree2cset(tree, &info);
      addcharset(compst, i, &info);
      return 1;
    }
    default: return 0;  /* not a charset */
  }
}


/*
** Repetition; optimizations:
** When pattern is a charset, use special code.
** When pattern is head fail, or if it starts with characters that
** are disjoint from what follows the repetitions, a simple test
** is enough (a fail inside the repetition would backtrack to fail
** again in the following pattern, so there is no need for a choice).
** When 'opt' is true, the repetition can reuse the Choice already
** active in the stack.
*/
static void coderep (CompileState *compst, TTree *tree, int opt,
                     const Charset *fl) {
  if (!coderepcharset(compst, tree)) {
    Charset st;
    int e1 = getfirst(tree, fullset, &st);
    if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
      /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
      int test = codetestset(compst, &st, 0);
      codegen(compst, tree, 0, test, fullset);
      closeloop(compst, test);
    }
    else {
      /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */
      /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */
      int commit, l2;
      int test = codetestset(compst, &st, e1);
      int pchoice = NOINST;
      if (opt)
        jumptohere(compst, addoffsetinst(compst, IPartialCommit));
      else
        pchoice = addoffsetinst(compst, IChoice);
      l2 = gethere(compst);
      codegen(compst, tree, 0, NOINST, fullset);
      commit = addoffsetinst(compst, IPartialCommit);
      jumptothere(compst, commit, l2);
      jumptohere(compst, pchoice);  /* no-op when 'pchoice' is NOINST */
      jumptohere(compst, test);
    }
  }
}


/*
** Not predicate; optimizations:
** In any case, if first test fails, 'not' succeeds, so it can jump to
** the end. If pattern is headfail, that is all (it cannot fail
** in other parts); this case includes 'not' of simple sets. Otherwise,
** use the default code (a choice plus a failtwice).
*/
static void codenot (CompileState *compst, TTree *tree) {
  Charset st;
  int e = getfirst(tree, fullset, &st);
  int test = codetestset(compst, &st, e);
  if (headfail(tree))  /* test (fail(p1)) -> L1; fail; L1: */
    addinstruction(compst, IFail, 0);
  else {
    /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */
    int pchoice = addoffsetinst(compst, IChoice);
    codegen(compst, tree, 0, NOINST, fullset);
    addinstruction(compst, IFailTwice, 0);
    jumptohere(compst, pchoice);
  }
  jumptohere(compst, test);
}


/*
** change open calls to calls, using list 'positions' to find
** correct offsets; also optimize tail calls
** (only instructions in the range ['from', 'to') are visited)
*/
static void correctcalls (CompileState *compst, int *positions,
                          int from, int to) {
  int i;
  Instruction *code = compst->p->code;
  for (i = from; i < to; i += sizei(&code[i])) {
    if (code[i].i.code == IOpenCall) {
      int n = code[i].i.aux2.key;  /* rule number */
      int rule = positions[n];  /* rule position */
      assert(rule == from || code[rule - 1].i.code == IRet);
      if (code[finaltarget(code, i + 2)].i.code == IRet)  /* call; ret ? */
        code[i].i.code = IJmp;  /* tail call */
      else
        code[i].i.code = ICall;
      jumptothere(compst, i, rule);  /* call jumps to respective rule */
    }
  }
  assert(i == to);
}


/*
** Code for a grammar:
** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
*/
static void codegrammar (CompileState *compst, TTree *grammar) {
  int positions[MAXRULES];
  int rulenumber = 0;
  TTree *rule;
  int firstcall = addoffsetinst(compst, ICall);  /* call initial rule */
  int jumptoend = addoffsetinst(compst, IJmp);  /* jump to the end */
  int start = gethere(compst);  /* here starts the initial rule */
  jumptohere(compst, firstcall);
  for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
    TTree *r = sib1(rule);
    assert(r->tag == TXInfo);
    positions[rulenumber++] = gethere(compst);  /* save rule position */
    codegen(compst, sib1(r), 0, NOINST, fullset);  /* code rule */
    addinstruction(compst, IRet, 0);
  }
  assert(rule->tag == TTrue);
  jumptohere(compst, jumptoend);
  correctcalls(compst, positions, start, gethere(compst));
}


/*
** Code a call to a rule as an IOpenCall carrying the rule number;
** 'correctcalls' later converts it to ICall (or IJmp, for a tail
** call) and sets its offset.
*/
static void codecall (CompileState *compst, TTree *call) {
  int c = addoffsetinst(compst, IOpenCall);  /* to be corrected later */
  assert(sib1(sib2(call))->tag == TXInfo);
  getinstr(compst, c).i.aux2.key = sib1(sib2(call))->u.n;  /* rule number */
}


/*
**
Code first child of a sequence
** (second child is called in-place to allow tail call)
** Return 'tt' for second child
*/
static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2,
                     int tt, const Charset *fl) {
  if (needfollow(p1)) {
    Charset fl1;
    getfirst(p2, fl, &fl1);  /* p1 follow is p2 first */
    codegen(compst, p1, 0, tt, &fl1);
  }
  else  /* use 'fullset' as follow */
    codegen(compst, p1, 0, tt, fullset);
  if (fixedlen(p1) != 0)  /* can 'p1' consume anything? */
    return NOINST;  /* invalidate test */
  else return tt;  /* else 'tt' still protects sib2 */
}


/*
** Main code-generation function: dispatch to auxiliary functions
** according to kind of tree. ('needfollow' should return true
** only for constructions that use 'fl'.)
** 'opt' is forwarded to choices and repetitions; 'tt' is the index of
** a dominating test instruction (or NOINST); 'fl' is the follow set.
*/
static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
                     const Charset *fl) {
 tailcall:
  switch (tree->tag) {
    case TChar: codechar(compst, tree->u.n, tt); break;
    case TAny: addinstruction(compst, IAny, 0); break;
    case TSet: codecharset(compst, tree, tt); break;
    case TTrue: break;
    case TFalse: addinstruction(compst, IFail, 0); break;
    case TUTFR: codeutfr(compst, tree); break;
    case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
    case TRep: coderep(compst, sib1(tree), opt, fl); break;
    case TBehind: codebehind(compst, tree); break;
    case TNot: codenot(compst, sib1(tree)); break;
    case TAnd: codeand(compst, sib1(tree), tt); break;
    case TCapture: codecapture(compst, tree, tt, fl); break;
    case TRunTime: coderuntime(compst, tree, tt); break;
    case TGrammar: codegrammar(compst, tree); break;
    case TCall: codecall(compst, tree); break;
    case TSeq: {
      tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl);  /* code 'p1' */
      /* codegen(compst, p2, opt, tt, fl); */
      /* the call above is done as a jump, so that long sequences do
         not deepen the C stack */
      tree = sib2(tree); goto tailcall;
    }
    default: assert(0);
  }
}


/*
** Optimize jumps and other jump-like instructions.
** * Update labels of instructions with labels to their final
**   destinations (e.g., choice L1; ...
L1: jmp L2: becomes
**   choice L2)
** * Jumps to other instructions that do jumps become those
**   instructions (e.g., jump to return becomes a return; jump
**   to commit becomes a commit)
** The code is scanned once, from index 0 up to 'compst->ncode',
** advancing by the size of each instruction.
*/
static void peephole (CompileState *compst) {
  Instruction *code = compst->p->code;
  int i;
  for (i = 0; i < compst->ncode; i += sizei(&code[i])) {
   redo:
    switch (code[i].i.code) {
      case IChoice: case ICall: case ICommit: case IPartialCommit:
      case IBackCommit: case ITestChar: case ITestSet:
      case ITestAny: {  /* instructions with labels */
        jumptothere(compst, i, finallabel(code, i));  /* optimize label */
        break;
      }
      case IJmp: {
        int ft = finaltarget(code, i);
        switch (code[ft].i.code) {  /* jumping to what? */
          case IRet: case IFail: case IFailTwice:
          case IEnd: {  /* instructions with unconditional implicit jumps */
            code[i] = code[ft];  /* jump becomes that instruction */
            code[i + 1].i.code = IEmpty;  /* 'no-op' for target position */
            break;
          }
          case ICommit: case IPartialCommit:
          case IBackCommit: {  /* inst. with unconditional explicit jumps */
            int fft = finallabel(code, ft);
            code[i] = code[ft];  /* jump becomes that instruction... */
            jumptothere(compst, i, fft);  /* but must correct its offset */
            goto redo;  /* reoptimize its label */
          }
          default: {
            jumptothere(compst, i, ft);  /* optimize label */
            break;
          }
        }
        break;
      }
      default: break;
    }
  }
  /* the last instruction coded must be the final IEnd */
  assert(code[i - 1].i.code == IEnd);
}


/*
** Compile a pattern. 'size' is the size of the pattern's tree,
** which gives a hint for the size of the final code.
*/ Instruction *compile (lua_State *L, Pattern *p, uint size) { CompileState compst; compst.p = p; compst.ncode = 0; compst.L = L; newcode(L, p, size/2u + 2); /* set initial size */ codegen(&compst, p->tree, 0, NOINST, fullset); addinstruction(&compst, IEnd, 0); realloccode(L, p, compst.ncode); /* set final size */ peephole(&compst); return p->code; } /* }====================================================== */ lpeg-1.1.0/test.lua0000775000175000017500000014301514446336477014067 0ustar robertoroberto#!/usr/bin/env lua -- require"strict" -- just to be pedantic local m = require"lpeg" -- for general use local a, b, c, d, e, f, g, p, t -- compatibility with Lua 5.2 local unpack = rawget(table, "unpack") or unpack local loadstring = rawget(_G, "loadstring") or load local any = m.P(1) local space = m.S" \t\n"^0 local function checkeq (x, y, p) if p then print(x,y) end if type(x) ~= "table" then assert(x == y) else for k,v in pairs(x) do checkeq(v, y[k], p) end for k,v in pairs(y) do checkeq(v, x[k], p) end end end local mt = getmetatable(m.P(1)) local allchar = {} for i=0,255 do allchar[i + 1] = i end allchar = string.char(unpack(allchar)) assert(#allchar == 256) local function cs2str (c) return m.match(m.Cs((c + m.P(1)/"")^0), allchar) end local function eqcharset (c1, c2) assert(cs2str(c1) == cs2str(c2)) end print"General tests for LPeg library" assert(type(m.version) == "string") print(m.version) assert(m.type("alo") ~= "pattern") assert(m.type(io.input) ~= "pattern") assert(m.type(m.P"alo") == "pattern") -- tests for some basic optimizations assert(m.match(m.P(false) + "a", "a") == 2) assert(m.match(m.P(true) + "a", "a") == 1) assert(m.match("a" + m.P(false), "b") == nil) assert(m.match("a" + m.P(true), "b") == 1) assert(m.match(m.P(false) * "a", "a") == nil) assert(m.match(m.P(true) * "a", "a") == 2) assert(m.match("a" * m.P(false), "a") == nil) assert(m.match("a" * m.P(true), "a") == 2) assert(m.match(#m.P(false) * "a", "a") == nil) assert(m.match(#m.P(true) * 
"a", "a") == 2) assert(m.match("a" * #m.P(false), "a") == nil) assert(m.match("a" * #m.P(true), "a") == 2) assert(m.match(m.P(1)^0, "abcd") == 5) assert(m.match(m.S("")^0, "abcd") == 1) -- tests for locale do assert(m.locale(m) == m) local t = {} assert(m.locale(t, m) == t) local x = m.locale() for n,v in pairs(x) do assert(type(n) == "string") eqcharset(v, m[n]) end end assert(m.match(3, "aaaa")) assert(m.match(4, "aaaa")) assert(not m.match(5, "aaaa")) assert(m.match(-3, "aa")) assert(not m.match(-3, "aaa")) assert(not m.match(-3, "aaaa")) assert(not m.match(-4, "aaaa")) assert(m.P(-5):match"aaaa") assert(m.match("a", "alo") == 2) assert(m.match("al", "alo") == 3) assert(not m.match("alu", "alo")) assert(m.match(true, "") == 1) local digit = m.S"0123456789" local upper = m.S"ABCDEFGHIJKLMNOPQRSTUVWXYZ" local lower = m.S"abcdefghijklmnopqrstuvwxyz" local letter = m.S"" + upper + lower local alpha = letter + digit + m.R() eqcharset(m.S"", m.P(false)) eqcharset(upper, m.R("AZ")) eqcharset(lower, m.R("az")) eqcharset(upper + lower, m.R("AZ", "az")) eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")) eqcharset(digit, m.S"01234567" + "8" + "9") eqcharset(upper, letter - lower) eqcharset(m.S(""), m.R()) assert(cs2str(m.S("")) == "") eqcharset(m.S"\0", "\0") eqcharset(m.S"\1\0\2", m.R"\0\2") eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") eqcharset(m.S("\0\255"), m.P"\0" + "\255") -- charset extremes local word = alpha^1 * (1 - alpha)^0 assert((word^0 * -1):match"alo alo") assert(m.match(word^1 * -1, "alo alo")) assert(m.match(word^2 * -1, "alo alo")) assert(not m.match(word^3 * -1, "alo alo")) assert(not m.match(word^-1 * -1, "alo alo")) assert(m.match(word^-2 * -1, "alo alo")) assert(m.match(word^-3 * -1, "alo alo")) local eos = m.P(-1) assert(m.match(digit^0 * letter * digit * eos, "1298a1")) assert(not m.match(digit^0 * letter * eos, "1257a1")) b = { [1] = "(" * (((1 - m.S"()") + #m.P"(" * m.V(1))^0) * ")" } assert(m.match(b, 
"(al())()")) assert(not m.match(b * eos, "(al())()")) assert(m.match(b * eos, "((al())()(é))")) assert(not m.match(b, "(al()()")) assert(not m.match(letter^1 - "for", "foreach")) assert(m.match(letter^1 - ("for" * eos), "foreach")) assert(not m.match(letter^1 - ("for" * eos), "for")) function basiclookfor (p) return m.P { [1] = p + (1 * m.V(1)) } end function caplookfor (p) return basiclookfor(p:C()) end assert(m.match(caplookfor(letter^1), " 4achou123...") == "achou") a = {m.match(caplookfor(letter^1)^0, " two words, one more ")} checkeq(a, {"two", "words", "one", "more"}) assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7) a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "123")} checkeq(a, {"123", "d"}) -- bug in LPeg 0.12 (nil value does not create a 'ktable') assert(m.match(m.Cc(nil), "") == nil) a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "abcd")} checkeq(a, {"abcd", "l"}) a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} checkeq(a, {10,20,30,2}) a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} checkeq(a, {1,10,20,30,2}) a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') checkeq(a, {1,10,20,30,2}) a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') checkeq(a, {1,7,8,10,20,30,2}) a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')} checkeq(a, {1,2,3,4}) a = {m.match(m.Cp() * letter^1 * m.Cp(), "abcd")} checkeq(a, {1, 5}) t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")} checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}) -- bug in 0.12 ('hascapture' did not check for captures inside a rule) do local pat = m.P{ 'S'; S1 = m.C('abc') + 3, S = #m.V('S1') -- rule has capture, but '#' must ignore it } assert(pat:match'abc' == 1) end -- bug: loop in 'hascaptures' do local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) assert(p:match("xxx") == "") end -- test for small capture boundary for i = 250,260 do assert(#m.match(m.C(i), string.rep('a', 
i)) == i) assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i) end -- tests for any*n and any*-n for n = 1, 550, 13 do local x_1 = string.rep('x', n - 1) local x = x_1 .. 'a' assert(not m.P(n):match(x_1)) assert(m.P(n):match(x) == n + 1) assert(n < 4 or m.match(m.P(n) + "xxx", x_1) == 4) assert(m.C(n):match(x) == x) assert(m.C(m.C(n)):match(x) == x) assert(m.P(-n):match(x_1) == 1) assert(not m.P(-n):match(x)) assert(n < 13 or m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20) local n3 = math.floor(n/3) assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1) end -- true values assert(m.P(0):match("x") == 1) assert(m.P(0):match("") == 1) assert(m.C(0):match("x") == "") assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1) assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0) assert(m.match(m.C(m.P(2)^1), "abcde") == "abcd") p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4 -- test for alternation optimization assert(m.match(m.P"a"^1 + "ab" + m.P"x"^0, "ab") == 2) assert(m.match((m.P"a"^1 + "ab" + m.P"x"^0 * 1)^0, "ab") == 3) assert(m.match(m.P"ab" + "cd" + "" + "cy" + "ak", "98") == 1) assert(m.match(m.P"ab" + "cd" + "ax" + "cy", "ax") == 3) assert(m.match("a" * m.P"b"^0 * "c" + "cd" + "ax" + "cy", "ax") == 3) assert(m.match((m.P"ab" + "cd" + "ax" + "cy")^0, "ax") == 3) assert(m.match(m.P(1) * "x" + m.S"" * "xu" + "ay", "ay") == 3) assert(m.match(m.P"abc" + "cde" + "aka", "aka") == 4) assert(m.match(m.S"abc" * "x" + "cde" + "aka", "ax") == 3) assert(m.match(m.S"abc" * "x" + "cde" + "aka", "aka") == 4) assert(m.match(m.S"abc" * "x" + "cde" + "aka", "cde") == 4) assert(m.match(m.S"abc" * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "ax") == 3) assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "aka") == 4) assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "cde") == 4) assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * 
"ka", "aka") == 4) assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "ax") == 3) assert(m.match(m.P(1) * "x" + "cde" + m.S"ab" * "ka", "aka") == 4) assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4) assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4) assert(m.match(m.P"eb" + "cd" + m.P"e"^0 + "x", "ee") == 3) assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "abcd") == 3) assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "eeex") == 4) assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "cd") == 3) assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "x") == 1) assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x" + "", "zee") == 1) assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "abcd") == 3) assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "eeex") == 4) assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "cd") == 3) assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "x") == 2) assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x" + "", "zee") == 1) assert(not m.match(("aa" * m.P"bc"^-1 + "aab") * "e", "aabe")) assert(m.match("alo" * (m.P"\n" + -1), "alo") == 4) -- bug in 0.12 (rc1) assert(m.match((m.P"\128\187\191" + m.S"abc")^0, "\128\187\191") == 4) assert(m.match(m.S"\0\128\255\127"^0, string.rep("\0\128\255\127", 10)) == 4*10 + 1) -- optimizations with optional parts assert(m.match(("ab" * -m.P"c")^-1, "abc") == 1) assert(m.match(("ab" * #m.P"c")^-1, "abd") == 1) assert(m.match(("ab" * m.B"c")^-1, "ab") == 1) assert(m.match(("ab" * m.P"cd"^0)^-1, "abcdcdc") == 7) assert(m.match(m.P"ab"^-1 - "c", "abcd") == 3) p = ('Aa' * ('Bb' * ('Cc' * m.P'Dd'^0)^0)^0)^-1 assert(p:match("AaBbCcDdBbCcDdDdDdBb") == 21) -- bug in 0.12.2 -- p = { ('ab' ('c' 'ef'?)*)? 
} p = m.C(('ab' * ('c' * m.P'ef'^-1)^0)^-1) s = "abcefccefc" assert(s == p:match(s)) pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510" assert(m.match(m.Cs((m.P"1" / "a" + m.P"5" / "b" + m.P"9" / "c" + 1)^0), pi) == m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})^0), pi)) print"+" -- tests for capture optimizations assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5) t = {m.match(((m.P"a" + m.Cp()) * m.P"x")^0, "axxaxx")} checkeq(t, {3, 6}) -- tests for numbered captures p = m.C(1) assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a") assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef") assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc") assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7) a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh") assert(a == "a" and b == "efg" and c == "h") -- test for table captures t = m.match(m.Ct(letter^1), "alo") checkeq(t, {}) t, n = m.match(m.Ct(m.C(letter)^1) * m.Cc"t", "alo") assert(n == "t" and table.concat(t) == "alo") t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") assert(table.concat(t, ";") == "alo;a;l;o") t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") assert(table.concat(t, ";") == "alo;a;l;o") t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())^1)), "alo") assert(table.concat(t[1], ";") == "1;2;2;3;3;4") t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo") checkeq(t, {"alo", "a", "o"}) -- tests for groups p = m.Cg(1) -- no capture assert(p:match('x') == 'x') p = m.Cg(m.P(true)/function () end * 1) -- no value assert(p:match('x') == 'x') p = m.Cg(m.Cg(m.Cg(m.C(1)))) assert(p:match('x') == 'x') p = m.Cg(m.Cg(m.Cg(m.C(1))^0) * m.Cg(m.Cc(1) * m.Cc(2))) t = {p:match'abc'} checkeq(t, {'a', 'b', 'c', 1, 2}) p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)^0 * m.Cg(m.Cc(20), "ho")) t = p:match'' checkeq(t, {hi = 10, ho = 20}) t = p:match'abc' checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}) -- non-string group names p = 
m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io)) t = p:match('abcdefghij') assert(t[print] == 'a' and t[23.5] == 'b' and t[io] == 'c') -- test for error messages local function checkerr (msg, f, ...) local st, err = pcall(f, ...) assert(not st and m.match({ m.P(msg) + 1 * m.V(1) }, err)) end checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a") checkerr("rule '1' used outside a grammar", m.match, m.V(1), "") checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), "") checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, "") checkerr("undefined in given grammar", m.match, { m.V{} }, "") checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }) checkerr("grammar has no initial rule", m.P, { [print] = {} }) -- grammar with a long call chain before left recursion p = {'a', a = m.V'b' * m.V'c' * m.V'd' * m.V'a', b = m.V'c', c = m.V'd', d = m.V'e', e = m.V'f', f = m.V'g', g = m.P'' } checkerr("rule 'a' may be left recursive", m.match, p, "a") -- Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit) -- the next grammar has an original sequence IJmp -> ICommit -> IJmp L1 -- that is optimized to ICommit L1 p = m.P { (m.P {m.P'abc'} + 'ayz') * m.V'y'; y = m.P'x' } assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') do print "testing large dynamic Cc" local lim = 2^16 - 1 local c = 0 local function seq (n) if n == 1 then c = c + 1; return m.Cc(c) else local m = math.floor(n / 2) return seq(m) * seq(n - m) end end p = m.Ct(seq(lim)) t = p:match('') assert(t[lim] == lim) checkerr("too many", function () p = p / print end) checkerr("too many", seq, lim + 1) end do -- nesting of captures too deep local p = m.C(1) for i = 1, 300 do p = m.Ct(p) end checkerr("too deep", p.match, p, "x") end -- tests for non-pattern as arguments to pattern functions p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } assert(m.match(p, "aaabaac") == 7) p = m.P'abc' * 2 * -5 * true * 'de' -- mix of 
numbers and strings and booleans assert(p:match("abc01de") == 8) assert(p:match("abc01de3456") == nil) p = 'abc' * (2 * (-5 * (true * m.P'de'))) assert(p:match("abc01de") == 8) assert(p:match("abc01de3456") == nil) p = { m.V(2), m.P"abc" } * (m.P{ "xx", xx = m.P"xx" } + { "x", x = m.P"a" * m.V"x" + "" }) assert(p:match("abcaaaxx") == 7) assert(p:match("abcxx") == 6) -- a large table capture t = m.match(m.Ct(m.C('a')^0), string.rep("a", 10000)) assert(#t == 10000 and t[1] == 'a' and t[#t] == 'a') print('+') -- bug in 0.10 (rechecking a grammar, after tail-call optimization) m.P{ m.P { (m.P(3) + "xuxu")^0 * m.V"xuxu", xuxu = m.P(1) } } local V = m.V local Space = m.S(" \n\t")^0 local Number = m.C(m.R("09")^1) * Space local FactorOp = m.C(m.S("+-")) * Space local TermOp = m.C(m.S("*/")) * Space local Open = "(" * Space local Close = ")" * Space local function f_factor (v1, op, v2, d) assert(d == nil) if op == "+" then return v1 + v2 else return v1 - v2 end end local function f_term (v1, op, v2, d) assert(d == nil) if op == "*" then return v1 * v2 else return v1 / v2 end end G = m.P{ "Exp", Exp = V"Factor" * (FactorOp * V"Factor" % f_factor)^0; Factor = V"Term" * (TermOp * V"Term" % f_term)^0; Term = Number / tonumber + Open * V"Exp" * Close; } G = Space * G * -1 for _, s in ipairs{" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"} do assert(m.match(G, s) == loadstring("return "..s)()) end -- test for grammars (errors deep in calling non-terminals) g = m.P{ [1] = m.V(2) + "a", [2] = "a" * m.V(3) * "x", [3] = "b" * m.V(3) + "c" } assert(m.match(g, "abbbcx") == 7) assert(m.match(g, "abbbbx") == 2) -- tests for \0 assert(m.match(m.R("\0\1")^1, "\0\1\0") == 4) assert(m.match(m.S("\0\1ab")^1, "\0\1\0a") == 5) assert(m.match(m.P(1)^3, "\0\1\0a") == 5) assert(not m.match(-4, "\0\1\0a")) assert(m.match("\0\1\0a", "\0\1\0a") == 5) assert(m.match("\0\0\0", "\0\0\0") == 4) assert(not m.match("\0\0\0", "\0\0")) -- tests for predicates assert(not m.match(-m.P("a") * 2, "alo")) 
assert(m.match(- -m.P("a") * 2, "alo") == 3)
assert(m.match(#m.P("a") * 2, "alo") == 3)
assert(m.match(##m.P("a") * 2, "alo") == 3)
assert(not m.match(##m.P("c") * 2, "alo"))
assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")
assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.")


-- fixed length
do
  -- 'and' predicate using fixed length
  local p = m.C(#("a" * (m.P("bd") + "cd")) * 2)
  assert(p:match("acd") == "ac")

  p = #m.P{ "a" * m.V(2), m.P"b" } * 2
  assert(p:match("abc") == 3)

  p = #(m.P"abc" * m.B"c")
  assert(p:match("abc") == 1 and not p:match("ab"))

  p = m.P{ "a" * m.V(2), m.P"b"^1 }
  checkerr("pattern may not have fixed length", m.B, p)

  p = "abc" * (m.P"b"^1 + m.P"a"^0)
  checkerr("pattern may not have fixed length", m.B, p)
end

p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3)
assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1)

p = -m.P'a' * m.Cc(10) + #m.P'a' * m.Cc(20)
assert(p:match('a') == 20 and p:match('') == 10 and p:match('b') == 10)


-- look-behind predicate
assert(not m.match(m.B'a', 'a'))
assert(m.match(1 * m.B'a', 'a') == 2)
assert(not m.match(m.B(1), 'a'))
assert(m.match(1 * m.B(1), 'a') == 2)
assert(m.match(-m.B(1), 'a') == 1)
assert(m.match(m.B(250), string.rep('a', 250)) == nil)
assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251)

-- look-behind with an open call
checkerr("pattern may not have fixed length", m.B, m.V'S1')
checkerr("too long to look behind", m.B, 260)

-- word boundary: a letter not preceded by a letter, or a non-letter
-- (including end of subject) preceded by a letter
B = #letter * -m.B(letter) + -letter * m.B(letter)

-- collect the position of every boundary in the subject
x = m.Ct({ (B * m.Cp())^-1 * (1 * m.V(1) + m.P(true)) })
-- two blanks between 'cal' and 'c': the expected boundaries 9 and 10
-- require the final 'c' to sit at position 9
checkeq(m.match(x, 'ar cal  c'), {1,3,4,7,9,10})
checkeq(m.match(x, ' ar cal '), {2,4,5,8})
checkeq(m.match(x, ' '), {})
checkeq(m.match(x, 'aloalo'), {1,7})

assert(m.match(B, "a") == 1)
assert(m.match(1 * B, "a") == 2)
assert(not m.B(1 - letter):match(""))
assert((-m.B(letter)):match("") == 1)

assert((4 * m.B(letter, 4)):match("aaaaaaaa") == 5)
assert(not (4 * m.B(#letter * 5)):match("aaaaaaaa"))
assert((4 * -m.B(#letter * 5)):match("aaaaaaaa") == 5)

-- look-behind with grammars
assert(m.match('a' * m.B{'x', x = m.P(3)}, 'aaa') == nil)
assert(m.match('aa' * m.B{'x', x = m.P('aaa')}, 'aaaa') == nil)
assert(m.match('aaa' * m.B{'x', x = m.P('aaa')}, 'aaaaa') == 4)


-- bug in 0.9
assert(m.match(('a' * #m.P'b'), "ab") == 2)
assert(not m.match(('a' * #m.P'b'), "a"))

assert(not m.match(#m.S'567', ""))
assert(m.match(#m.S'567' * 1, "6") == 2)


-- tests for Tail Calls

p = m.P{ 'a' * m.V(1) + '' }
assert(p:match(string.rep('a', 1000)) == 1001)

-- create a grammar for a simple DFA for even number of 0s and 1s
--
--  ->1 <---0---> 2
--    ^           ^
--    |           |
--    1           1
--    |           |
--    V           V
--    3 <---0---> 4
--
-- this grammar should keep no backtracking information

p = m.P{
  [1] = '0' * m.V(2) + '1' * m.V(3) + -1,
  [2] = '0' * m.V(1) + '1' * m.V(4),
  [3] = '0' * m.V(4) + '1' * m.V(1),
  [4] = '0' * m.V(3) + '1' * m.V(2),
}

assert(p:match(string.rep("00", 10000)))
assert(p:match(string.rep("01", 10000)))
assert(p:match(string.rep("011", 10000)))
assert(not p:match(string.rep("011", 10000) .. "1"))
assert(not p:match(string.rep("011", 10001)))


-- this grammar does need backtracking info.
local lim = 10000
p = m.P{ '0' * m.V(1) + '0' }
checkerr("stack overflow", m.match, p, string.rep("0", lim))
m.setmaxstack(2*lim)
checkerr("stack overflow", m.match, p, string.rep("0", lim))
m.setmaxstack(2*lim + 4)
assert(m.match(p, string.rep("0", lim)) == lim + 1)

-- this repetition should not need stack space (only the call does)
p = m.P{ ('a' * m.V(1))^0 * 'b' + 'c' }
m.setmaxstack(200)
assert(p:match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362)

m.setmaxstack(100)   -- restore low limit

-- tests for optional start position
assert(m.match("a", "abc", 1))
assert(m.match("b", "abc", 2))
assert(m.match("c", "abc", 3))
assert(not m.match(1, "abc", 4))
assert(m.match("a", "abc", -3))
assert(m.match("b", "abc", -2))
assert(m.match("c", "abc", -1))
assert(m.match("abc", "abc", -4))   -- truncate to position 1

assert(m.match("", "abc", 10))   -- empty string is everywhere!
assert(m.match("", "", 10))
assert(not m.match(1, "", 1))
assert(not m.match(1, "", -1))
assert(not m.match(1, "", 0))

print("+")


-- tests for argument captures
checkerr("invalid argument", m.Carg, 0)
checkerr("invalid argument", m.Carg, -1)
checkerr("invalid argument", m.Carg, 2^18)
checkerr("absent extra argument #1", m.match, m.Carg(1), 'a', 1)
assert(m.match(m.Carg(1), 'a', 1, print) == print)
x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)}
checkeq(x, {10, 20})

assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") *
                     m.Cmt(m.Cb("a"), function (s,i,x)
                                        assert(s == "a" and i == 1);
                                        return i, x+1
                                      end) *
                     m.Carg(2), function (s,i,a,b,c)
                                  assert(s == "a" and i == 1 and c == nil);
                                  return i, 2*a + 3*b
                                end) * "a",
               "a", 1, false, 100, 1000) == 2*1001 + 3*100)


-- tests for Lua functions

t = {}
s = ""
-- records every position where it is tried and always fails
p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; return nil end) * false
s = "hi, this is a test"
assert(m.match(((p - m.P(-1)) + 2)^0, s) == string.len(s) + 1)
assert(#t == string.len(s)/2 and t[1] == 1 and t[2] == 3)

assert(not m.match(p, s))

p = mt.__add(function (s, i) return i end, function (s, i) return nil end)
assert(m.match(p, "alo"))

p = mt.__mul(function (s, i) return i end, function (s, i) return nil end)
assert(not m.match(p, "alo"))


t = {}
p = function (s1, i) assert(s == s1); t[#t + 1] = i; return i end
s = "hi, this is a test"
assert(m.match((m.P(1) * p)^0, s) == string.len(s) + 1)
assert(#t == string.len(s) and t[1] == 2 and t[2] == 3)

t = {}
p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i;
                         return i <= s1:len() and i end) * 1
s = "hi, this is a test"
assert(m.match(p^0, s) == string.len(s) + 1)
assert(#t == string.len(s) + 1 and t[1] == 1 and t[2] == 2)

p = function (s1, i) return m.match(m.P"a"^1, s1, i) end
assert(m.match(p, "aaaa") == 5)
assert(m.match(p, "abaa") == 2)
assert(not m.match(p, "baaa"))

-- positions returned by a function must lie in [current, #subject + 1]
checkerr("invalid position", m.match, function () return 2^20 end, s)
checkerr("invalid position", m.match, function () return 0 end, s)
checkerr("invalid position", m.match, function (s, i) return i - 1 end, s)
checkerr("invalid position", m.match,
             m.P(1)^0 * function (_, i) return i - 1 end, s)
assert(m.match(m.P(1)^0 * function (_, i) return i end * -1, s))
checkerr("invalid position", m.match,
             m.P(1)^0 * function (_, i) return i + 1 end, s)
assert(m.match(m.P(function (s, i) return s:len() + 1 end) * -1, s))
checkerr("invalid position", m.match,
             m.P(function (s, i) return s:len() + 2 end) * -1, s)
assert(not m.match(m.P(function (s, i) return s:len() end) * -1, s))
assert(m.match(m.P(1)^0 * function (_, i) return true end, s) ==
       string.len(s) + 1)
for i = 1, string.len(s) + 1 do
  assert(m.match(function (_, _) return i end, s) == i)
end

p = (m.P(function (s, i) return i%2 == 0 and i end) * 1
  + m.P(function (s, i) return i%2 ~= 0 and i + 2 <= s:len() and i end) * 3)^0
  * -1
assert(p:match(string.rep('a', 14000)))


-- tests for Function Replacements

-- packs its arguments into a table, unless the first one is "x"
f = function (a, ...) if a ~= "x" then return {a, ...} end end

t = m.match(m.C(1)^0/f, "abc")
checkeq(t, {"a", "b", "c"})

t = m.match(m.C(1)^0/f/f, "abc")
checkeq(t, {{"a", "b", "c"}})

t = m.match(m.P(1)^0/f/f, "abc")   -- no capture
checkeq(t, {{"abc"}})

t = m.match((m.P(1)^0/f * m.Cp())/f, "abc")
checkeq(t, {{"abc"}, 4})

t = m.match((m.C(1)^0/f * m.Cp())/f, "abc")
checkeq(t, {{"a", "b", "c"}, 4})

t = m.match((m.C(1)^0/f * m.Cp())/f, "xbc")
checkeq(t, {4})

t = m.match(m.C(m.C(1)^0)/f, "abc")
checkeq(t, {"abc", "a", "b", "c"})

-- prepends the value 1 to its arguments
g = function (...) return 1, ... end
t = {m.match(m.C(1)^0/g/g, "abc")}
checkeq(t, {1, 1, "a", "b", "c"})

t = {m.match(m.Cc(nil,nil,4) * m.Cc(nil,3) * m.Cc(nil, nil) / g / g, "")}
t1 = {1,1,nil,nil,4,nil,3,nil,nil}
for i=1,10 do assert(t[i] == t1[i]) end

-- bug in 0.12.2: ktable with only nil could be eliminated when joining
-- with a pattern without ktable
assert((m.P"aaa" * m.Cc(nil)):match"aaa" == nil)

t = {m.match((m.C(1) / function (x) return x, x.."x" end)^0, "abc")}
checkeq(t, {"a", "ax", "b", "bx", "c", "cx"})

t = m.match(m.Ct((m.C(1) / function (x,y) return y, x end * m.Cc(1))^0), "abc")
checkeq(t, {nil, "a", 1, nil, "b", 1, nil, "c", 1})


-- tests for Query Replacements

assert(m.match(m.C(m.C(1)^0)/{abc = 10}, "abc") == 10)
assert(m.match(m.C(1)^0/{a = 10}, "abc") == 10)
assert(m.match(m.S("ba")^0/{ab = 40}, "abc") == 40)
t = m.match(m.Ct((m.S("ba")/{a = 40})^0), "abc")
checkeq(t, {40})

assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})^0), "abcdde") == ".bc....e")
assert(m.match(m.Cs((m.C(1)/{f="."})^0), "abcdde") == "abcdde")
assert(m.match(m.Cs((m.C(1)/{d="."})^0), "abcdde") == "abc..e")
assert(m.match(m.Cs((m.C(1)/{e="."})^0), "abcdde") == "abcdd.")
assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})^0), "eefef") == "..+.+")
assert(m.match(m.Cs((m.C(1))^0), "abcdde") == "abcdde")
assert(m.match(m.Cs(m.C(m.C(1)^0)), "abcdde") == "abcdde")
assert(m.match(1 * m.Cs(m.P(1)^0), "abcdde") == "bcdde")
assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "abcdde") == "abcdde")
assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "0ab0b0") == "xabxbx")
assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})^0), "b0a0b") == "3xax3")
assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3)
assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3)

assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0")

assert(m.match(m.Cs((m.P(1) / "%0")^0), "abcd") == "abcd")
assert(m.match(m.Cs((m.P(1) / "%0.%0")^0), "abcd") == "a.ab.bc.cd.d")
assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)^0), "abcad") == "a.abca.ad")
assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a")
assert(m.match(m.Cs((m.P(1) / ".xx")^0), "abcd") == ".xx.xx.xx.xx")
assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") ==
   "411 - abc ")

assert(m.match(m.P(1)/"%0", "abc") == "a")
checkerr("invalid capture index", m.match, m.P(1)/"%1", "abc")
checkerr("invalid capture index", m.match, m.P(1)/"%9", "abc")

-- nine simple captures, so that "%9" is a valid index
p = m.C(1)
p = p * p; p = p * p; p = p * p * m.C(1) / "%9 - %1"
assert(p:match("1234567890") == "9 - 1")

assert(m.match(m.Cc(print), "") == print)

-- too many captures (just ignore extra ones)
p = m.C(1)^0 / "%2-%9-%0-%9"
assert(p:match"01234567890123456789" == "1-8-01234567890123456789-8")
s = string.rep("12345678901234567890", 20)
assert(m.match(m.C(1)^0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3")

-- string captures with non-string subcaptures
p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1"
assert(p:match'x' == 'alo - x - alo')

checkerr("invalid capture value (a boolean)", m.match, m.Cc(true) / "%1", "a")

-- long strings for string capture
l = 10000
s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l)

p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1'

assert(p:match(s) == string.rep('c', l) ..
                     string.rep('b', l) ..
                     string.rep('a', l))

print"+"

-- accumulator capture

-- ignores the folded item and just counts how many times it is called
function f (x) return x + 1 end
assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7)
assert(m.match(m.Cc(0) * (m.C(1) % f)^0, "alo alo") == 7)

t = {m.match(m.Cf(m.Cc(1,2,3), error), "")}
checkeq(t, {1})
p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R"az"^1) * "=" * m.C(m.R"az"^1) * ";")^0,
         rawset)
t = p:match("a=b;c=du;xux=yuy;")
checkeq(t, {a="b", c="du", xux="yuy"})


-- errors in fold capture

-- no initial capture
checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa')
-- no initial capture (very long match forces fold to be a pair open-close)
checkerr("no initial value", m.match, m.Cf(m.P(500), print),
         string.rep('a', 600))


-- errors in accumulator capture

-- no initial capture
checkerr("no previous value", m.match, m.P(5) % print, 'aaaaaa')
-- no initial capture (very long match forces fold to be a pair open-close)
checkerr("no previous value", m.match, m.P(500) % print,
         string.rep('a', 600))


-- tests for loop checker

-- Checks that 'p' is treated as nullable: building 'p^0' must be
-- rejected with the "may accept empty string" error.
local function isnullable (p)
  checkerr("may accept empty string", function (p) return p^0 end, m.P(p))
end

isnullable(m.P("x")^-4)
assert(m.match(((m.P(0) + 1) * m.S"al")^0, "alo") == 3)
assert(m.match((("x" + #m.P(1))^-4 * m.S"al")^0, "alo") == 3)
isnullable("")
isnullable(m.P("x")^0)
isnullable(m.P("x")^-1)
isnullable(m.P("x") + 1 + 2 + m.P("a")^-1)
isnullable(-m.P("ab"))
isnullable(- -m.P("ab"))
isnullable(# #(m.P("ab") + "xy"))
isnullable(- #m.P("ab")^0)
isnullable(# -m.P("ab")^1)
isnullable(#m.V(3))
isnullable(m.V(3) + m.V(1) + m.P('a')^-1)
isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)})
assert(m.match(m.P{[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)}^0, "abc")
       == 3)
assert(m.match(m.P""^-3, "a") == 1)

-- Matches the pattern built by 'basiclookfor(p)' (defined elsewhere in
-- this file) against 's'.
local function find (p, s)
  return m.match(basiclookfor(p), s)
end


-- Checks that 'm.P(g)' raises an error; when 'expected' is given, 'find'
-- must also succeed on the error message.
local function badgrammar (g, expected)
  local stat, msg = pcall(m.P, g)
  assert(not stat)
  if expected then assert(find(expected, msg)) end
end

badgrammar({[1] = m.V(1)}, "rule '1'")
badgrammar({[1] = m.V(2)}, "rule '2'")   -- invalid non-terminal
badgrammar({[1] = m.V"x"}, "rule 'x'")   -- invalid non-terminal
badgrammar({[1] = m.V{}}, "rule '(a table)'")   -- invalid non-terminal
badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'")  -- left-recursive
badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'")  -- left-recursive
badgrammar({[1] = -1 * m.V(1)}, "rule '1'")  -- left-recursive
badgrammar({[1] = -1 + m.V(1)}, "rule '1'")  -- left-recursive
badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'")  -- left-recursive
badgrammar({[1] = 1 * m.V(2)^0, [2] = m.P(0)}, "rule '1'")  -- inf. loop
badgrammar({ m.V(2), m.V(3)^0, m.P"" }, "rule '2'")  -- inf. loop
badgrammar({ m.V(2) * m.V(3)^0, m.V(3)^0, m.P"" }, "rule '1'")  -- inf. loop
badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'")  -- inf. loop
badgrammar({ -(m.V(1) * 'a') }, "rule '1'")  -- inf. loop
badgrammar({"x", x = m.P'a'^-1 * m.V"x"}, "rule 'x'")  -- left recursive
badgrammar({"x", x = m.P'a' * m.V"y"^1, y = #m.P(1)}, "rule 'x'")

assert(m.match({'a' * -m.V(1)}, "aaa") == 2)
assert(m.match({'a' * -m.V(1)}, "aaaa") == nil)


-- good x bad grammars
m.P{ ('a' * m.V(1))^-1 }
m.P{ -('a' * m.V(1)) }
m.P{ ('abc' * m.V(1))^-1 }
m.P{ -('abc' * m.V(1)) }
badgrammar{ #m.P('abc') * m.V(1) }
badgrammar{ -('a' + m.V(1)) }
m.P{ #('a' * m.V(1)) }
badgrammar{ #('a' + m.V(1)) }
m.P{ m.B{ m.P'abc' } * 'a' * m.V(1) }
badgrammar{ m.B{ m.P'abc' } * m.V(1) }
badgrammar{ ('a' + m.P'bcd')^-1 * m.V(1) }


-- simple tests for maximum sizes:
local p = m.P"a"
for i=1,14 do p = p * p end

p = {}
for i=1,100 do p[i] = m.P"a" end
p = m.P(p)


-- strange values for rule labels

p = m.P{
  "print",
  print = m.V(print),
  [print] = m.V(_G),
  [_G] = m.P"a",
}

assert(p:match("a"))

-- initial rule
g = {}
for i = 1, 10 do g["i"..i] = "a" * m.V("i"..i+1) end
g.i11 = m.P""
for i = 1, 10 do
  g[1] = "i"..i
  local p = m.P(g)
  assert(p:match("aaaaaaaaaaa") == 11 - i + 1)
end

print "testing back references"

checkerr("back reference 'x' not found", m.match, m.Cb('x'), '')
checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a')

p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k"))
t = p:match("ab")
checkeq(t, {"a", "b"})


do
  -- some basic cases
  assert(m.match(m.Cg(m.Cc(3), "a") * m.Cb("a"), "a") == 3)
  assert(m.match(m.Cg(m.C(1), 133) * m.Cb(133), "X") == "X")

  -- first reference to 'x' should not see the group enclosing it
  local p = m.Cg(m.Cb('x'), 'x') * m.Cb('x')
  checkerr("back reference 'x' not found", m.match, p, '')

  local p = m.Cg(m.Cb('x') * m.C(1), 'x') * m.Cb('x')
  checkerr("back reference 'x' not found", m.match, p, 'abc')

  -- reference to 'x' should not see the group enclosed in another capture
  local s = string.rep("a", 30)
  local p = (m.C(1)^-4 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x')
  checkerr("back reference 'x' not found", m.match, p, s)

  local p = (m.C(1)^-20 * m.Cg(m.C(1), 'x')) / {} * m.Cb('x')
  checkerr("back reference 'x' not found", m.match, p, s)

  -- second reference 'k' should refer to 10 and first ref. 'k'
  p = m.Cg(m.Cc(20), 'k') * m.Cg(m.Cc(10) * m.Cb('k') * m.C(1), 'k') *
      (m.Cb('k') / function (a,b,c) return a*10 + b + tonumber(c) end)
  -- 10 * 10 (Cc) + 20 (Cb) + 7 (C) == 127
  assert(p:match("756") == 127)

end

p = m.P(true)
for i = 1, 10 do p = p * m.Cg(1, i) end
for i = 1, 10 do
  local p = p * m.Cb(i)
  assert(p:match('abcdefghij') == string.sub('abcdefghij', i, i))
end


t = {}
-- logs each value it receives in 't' and returns it with an "x" appended
function foo (p) t[#t + 1] = p; return p .. "x" end

p = m.Cg(m.C(2) / foo, "x") * m.Cb"x" *
    m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" *
    m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" *
    m.Cg(m.Cb('x') / foo, "x") * m.Cb"x"
x = {p:match'ab'}
checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'})
checkeq(t, {'ab',
            'ab', 'abx',
            'ab', 'abx', 'abxx',
            'ab', 'abx', 'abxx', 'abxxx'})



-- tests for match-time captures

p = m.P'a' * (function (s, i) return (s:sub(i, i) == 'b') and i + 1 end)
  + 'acd'

assert(p:match('abc') == 3)
assert(p:match('acd') == 4)

-- match-time function that always succeeds and returns its captures unchanged
local function id (s, i, ...)
  return true, ...
end

do   -- run-time capture in an end predicate (should discard its value)
  local x = 0
  function foo (s, i)
    x = x + 1
    return true, x
  end

  local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo)
  assert(p:match("xx") == 2)
end

assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) +
              m.R'09'^1 / string.char +
              m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y")

p = m.P{'S',
  S = m.V'atom' * space
    + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V'S'^1, id) + m.P(true)) * ")" * space), id),
  atom = m.Cmt(m.C(m.R("AZ", "az", "09")^1), id)
}
x = p:match"(a g () ((b) c) (d (e)))"
checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}});

x = {(m.Cmt(1, id)^0):match(string.rep('a', 500))}
assert(#x == 500)

-- new 'id': succeeds (with extra values 1, 3, 7) only for the capture 'a'
local function id(s, i, x)
  if x == 'a' then return i, 1, 3, 7
  else return nil, 2, 4, 6, 8
  end
end

p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0
assert(table.concat{p:match('abababab')} == string.rep('137', 4))

-- uses the captured text 'x' as a pattern, matched against the subject
-- starting where the capture began
local function ref (s, i, x)
  return m.match(x, s, i - x:len())
end

assert(m.Cmt(m.P(1)^0, ref):match('alo') == 4)
assert((m.P(1) * m.Cmt(m.P(1)^0, ref)):match('alo') == 4)
assert(not (m.P(1) * m.Cmt(m.C(1)^0, ref)):match('alo'))

ref = function (s,i,x) return i == tonumber(x) and i, 'xuxu' end

assert(m.Cmt(1, ref):match'2')
assert(not m.Cmt(1, ref):match'1')
assert(m.Cmt(m.P(1)^0, ref):match'03')

function ref (s, i, a, b)
  if a == b then return i, a:upper() end
end

p = m.Cmt(m.C(m.R"az"^1) * "-" * m.C(m.R"az"^1), ref)
p = (any - p)^0 * p * any^0 * -1
assert(p:match'abbbc-bc ddaa' == 'BC')

do   -- match-time captures cannot be optimized away
  local touch = 0
  f = m.P(function () touch = touch + 1; return true end)

  -- asserts that 'f' ran exactly 'n' times (default 1) and resets the counter
  local function check(n) n = n or 1; assert(touch == n); touch = 0 end

  assert(m.match(f * false + 'b', 'a') == nil); check()
  assert(m.match(f * false + 'b', '') == nil); check()
  assert(m.match( (f * 'a')^0 * 'b', 'b') == 2); check()
  assert(m.match( (f * 'a')^0 * 'b', '') == nil); check()
  assert(m.match( (f * 'a')^-1 * 'b', 'b') == 2); check()
  assert(m.match( (f * 'a')^-1 * 'b', '') == nil); check()
  assert(m.match( ('b' + f * 'a')^-1 * 'b', '') == nil); check()
  assert(m.match( (m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); check()
  assert(m.match( (-m.P(1) * m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil);
     check()
  assert(m.match( (f * 'a' + 'b')^-1 * 'b', '') == nil); check()
  assert(m.match(f * 'a' + f * 'b', 'b') == 2); check(2)
  assert(m.match(f * 'a' + f * 'b', 'a') == 2); check(1)
  assert(m.match(-f * 'a' + 'b', 'b') == 2); check(1)
  assert(m.match(-f * 'a' + 'b', '') == nil); check(1)
end

-- Lua-style long bracket: the closing level must equal the opening one
c = '[' * m.Cg(m.P'='^0, "init") * '[' *
    { m.Cmt(']' * m.C(m.P'='^0) * ']' * m.Cb("init"), function (_, _, s1, s2)
                                               return s1 == s2 end)
       + 1 * m.V(1) } / 0

assert(c:match'[==[]]====]]]]==]===[]' == 18)
assert(c:match'[[]=]====]=]]]==]===[]' == 14)
assert(not c:match'[[]=]====]=]=]==]===[]')


-- old bug: optimization of concat with fail removed match-time capture
p = m.Cmt(0, function (s) p = s end) * m.P(false)
assert(not p:match('alo'))
assert(p == 'alo')


-- ensure that failed match-time captures are not kept on Lua stack
do
  local t = {__mode = "kv"}; setmetatable(t,t)
  local c = 0

  local function foo (s,i)
    collectgarbage();
    -- after a full collection, only the "__mode" field may remain in 't'
    assert(next(t) == "__mode" and next(t, "__mode") == nil)
    local x = {}
    t[x] = true
    c = c + 1
    return i, x
  end

  local p = m.P{ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P"" }
  p:match(string.rep('1', 10))
  assert(c == 11)
end


-- Return a match-time capture that returns 'n' captures
local function manyCmt (n)
    return m.Cmt("a", function ()
             local a = {}; for i = 1, n do a[i] = n - i end
             return true, unpack(a)
           end)
end

-- bug in 1.0: failed match-time that used previous match-time results
do
  local x
  local function aux (...) x = #{...}; return false end
  local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")}
  assert(#res == 10 and res[1] == 9 and res[10] == 0)
end


-- bug in 1.0: problems with match-times returning too many captures
if _VERSION >= "Lua 5.2" then
  local lim = 2^11 - 10
  local res = {m.match(manyCmt(lim), "a")}
  assert(#res == lim and res[1] == lim - 1 and res[lim] == 0)
  checkerr("too many", m.match, manyCmt(2^15), "a")
end

p = (m.P(function () return true, "a" end) * 'a'
  + m.P(function (s, i) return i, "aa", 20 end) * 'b'
  + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0

t = {p:match('abacc')}
checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'})


do   print"testing large grammars"
  local lim = 1000    -- number of rules
  local t = {}

  for i = 3, lim do
    t[i] = m.V(i - 1)   -- each rule calls previous one
  end
  t[1] = m.V(lim)    -- start on last rule
  t[2] = m.C("alo")  -- final rule

  local P = m.P(t)   -- build grammar
  assert(P:match("alo") == "alo")

  t[#t + 1] = m.P("x")   -- one more rule...
  checkerr("too many rules", m.P, t)
end


print "testing UTF-8 ranges"

do   -- a few typical UTF-8 ranges
  local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0"
          + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0"
          + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0"
          + m.utfR(0, 0x7f)^1 / "ascii: %0"
          + m.utfR(0, 0x10ffff) / "other: %0"

  p = m.Ct(p^0) * -m.P(1)

  local cyr = "ждюя"
  local emot = "\240\159\152\128\240\159\153\128"   -- 😀🙀
  local cjk = "专举乸"
  local ascii = "alo"
  local last = "\244\143\191\191"                   -- U+10FFFF

  local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last
  t = (p:match(s))

  assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and
         t[3] == "emot: " .. emot and t[4] == "other: —" and
         t[5] == "cjk: " .. cjk and t[6] == "other: —" and
         t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and
         t[9] == nil)

  -- failing UTF-8 matches and borders
  assert(not m.match(m.utfR(10, 0x2000), "\9"))
  assert(not m.match(m.utfR(10, 0x2000), "\226\128\129"))
  assert(m.match(m.utfR(10, 0x2000), "\10") == 2)
  assert(m.match(m.utfR(10, 0x2000), "\226\128\128") == 4)
end


do   -- valid and invalid code points
  local p = m.utfR(0, 0x10ffff)^0
  assert(p:match("汉字\128") == #"汉字" + 1)
  assert(p:match("\244\159\191") == 1)
  assert(p:match("\244\159\191\191") == 1)
  assert(p:match("\255") == 1)

   -- basic errors
  checkerr("empty range", m.utfR, 1, 0)
  checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1)
end


do  -- back references (fixed width)
  -- match a byte after a CJK point
  local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1)
  p = m.P{ p + m.P(1) * m.V(1) }   -- search for 'p'
  assert(p:match("ab д 专X x") == "X")

  -- match a byte after a hebrew point
  local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1)
  p = m.P(#"ש") * p
  assert(p:match("שX") == "X")

  checkerr("fixed length", m.B, m.utfR(0, 0x10ffff))
end



-------------------------------------------------------------------
-- Tests for 're' module
-------------------------------------------------------------------
print"testing 're' module"

local re = require "re"

local match, compile = re.match, re.compile



assert(match("a", ".") == 2)
assert(match("a", "''") == 1)
assert(match("", " ! . ") == 1)
assert(not match("a", " ! . "))
assert(match("abcde", " ( . . ) * ") == 5)
assert(match("abbcde", " [a-c] +") == 5)
assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7)
assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8)
assert(match("abbc--", " [a-c] + +") == 5)
assert(match("abbc--", " [ac-] +") == 2)
assert(match("abbc--", " [-acb] + ") == 7)
assert(not match("abbcde", " [b-z] + "))
assert(match("abb\"de", '"abb"["]"de"') == 7)
assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee")
assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8)

assert(re.match("aaand", "[a]^2") == 3)

local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")}
checkeq(t, {4, 5, 7})
local t = {match("abceefe", "((&&'e' {})? .)*")}
checkeq(t, {4, 5, 7})
local t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")}
checkeq(t, {4, 5, 7})
local t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")}
checkeq(t, {4, 5, 7})

assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5)
assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4)
assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8)

assert(match("abc", "a <- (. a)?") == 4)
b = "balanced <- '(' ([^()] / balanced)* ')'"
assert(match("(abc)", b))
assert(match("(a(b)((c) (d)))", b))
assert(not match("(a(b ((c) (d)))", b))

b = compile[[  balanced <- "(" ([^()] / balanced)* ")" ]]
assert(b == m.P(b))
assert(b:match"((((a))(b)))")

-- each rule sits on its own line: an 're' comment runs to the end of the line
local g = [[
  S <- "0" B / "1" A / ""   -- balanced strings
  A <- "0" S / "1" A A      -- one more 0
  B <- "1" S / "0" B B      -- one more 1
]]
assert(match("00011011", g) == 9)

local g = [[
  S <- ("0" B / "1" A)*
  A <- "0" / "1" A A
  B <- "1" / "0" B B
]]
assert(match("00011011", g) == 9)
assert(match("000110110", g) == 9)
assert(match("011110110", g) == 3)
assert(match("000110010", g) == 1)

s = "aaaaaaaaaaaaaaaaaaaaaaaa"
assert(match(s, "'a'^3") == 4)
assert(match(s, "'a'^0") == 1)
assert(match(s, "'a'^+3") == s:len() + 1)
assert(not match(s, "'a'^+30"))
assert(match(s, "'a'^-30") == s:len() + 1)
assert(match(s, "'a'^-5") == 6)
for i = 1, s:len() do
  assert(match(s, string.format("'a'^+%d", i)) >= i + 1)
  assert(match(s, string.format("'a'^-%d", i)) <= i + 1)
  assert(match(s, string.format("'a'^%d", i)) == i + 1)
end
assert(match("01234567890123456789", "[0-9]^3+") == 19)


assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123")
t = match("0123456789", "{| {.}* |}")
checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"})
assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101")

assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c")
assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d")
assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6)

assert(not match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x"))
assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x"))

eqcharset(compile"[]]", "]")
eqcharset(compile"[][]", m.S"[]")
eqcharset(compile"[]-]", m.S"-]")
eqcharset(compile"[-]", m.S"-")
eqcharset(compile"[az-]", m.S"a-z")
eqcharset(compile"[-az]", m.S"a-z")
eqcharset(compile"[a-z]", m.R"az")
eqcharset(compile"[]['\"]", m.S[[]['"]])

eqcharset(compile"[^]]", any - "]")
eqcharset(compile"[^][]", any - m.S"[]")
eqcharset(compile"[^]-]", any - m.S"-]")
eqcharset(compile"[^]-]", any - m.S"-]")
eqcharset(compile"[^-]", any - m.S"-")
eqcharset(compile"[^az-]", any - m.S"a-z")
eqcharset(compile"[^-az]", any - m.S"a-z")
eqcharset(compile"[^a-z]", any - m.R"az")
eqcharset(compile"[^]['\"]", any - m.S[[]['"]])

-- tests for comments in 're'
-- (the first comment must end at a line break, or rule '_B' would be
-- swallowed by it and left undefined)
e = compile[[
A  <- _B   -- \t \n %nl .<> <- -> --
_B <- 'x'  --]]
assert(e:match'xy' == 2)

-- tests for 're' with pre-definitions
defs = {digits = m.R"09", letters = m.R"az", _=m.P"__"}
e = compile("%letters (%letters / %digits)*", defs)
assert(e:match"x123" == 5)
e = compile("%_", defs)
assert(e:match"__" == 3)

e = compile([[
  S <- A+
  A <- %letters+ B
  B <- %digits+
]], defs)

e = compile("{[0-9]+'.'?[0-9]*} -> sin", math)
assert(e:match("2.34") == math.sin(2.34))

e = compile("'pi' -> math", _G)
assert(e:match("pi") == math.pi)

e = compile("[ ]* 'version' -> _VERSION", _G)
assert(e:match(" version") == _VERSION)


function eq (_, _, a, b) return a == b end

c = re.compile([[
  longstring <- '[' {:init: '='* :} '[' close
  close <- ']' =init ']' / . close
]])

assert(c:match'[==[]]===]]]]==]===[]' == 17)
assert(c:match'[[]=]====]=]]]==]===[]' == 14)
assert(not c:match'[[]=]====]=]=]==]===[]')

c = re.compile" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. "

assert(c:match'[==[]]===]]]]==]')
assert(c:match'[[]=]====]=][]==]===[]]')
assert(not c:match'[[]=]====]=]=]==]===[]')

assert(re.find("hi alalo", "{:x:..:} =x") == 4)
assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4)
assert(not re.find("hi alalo", "{:x:..:} =x", 5))
assert(re.find("hi alalo", "{'al'}", 5) == 6)
assert(re.find("hi aloalolo", "{:x:..:} =x") == 8)
assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11)

-- re.find discards any captures
local a,b,c = re.find("alo", "{.}{'o'}")
assert(a == 2 and b == 3 and c == nil)

-- Returns the substring of 's' spanned by the first match of 'p' found by
-- 're.find', or nothing when there is no match.
local function match (s,p)
  local i,e = re.find(s,p)
  if i then return s:sub(i, e) end
end
assert(match("alo alo", '[a-z]+') == "alo")
assert(match("alo alo", '{:x: [a-z]+ :} =x') == nil)
assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo")

assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo")
assert(re.gsub("alo alo", "%w+", ".") == ". .")
assert(re.gsub("hi, how are you", "[aeiou]", string.upper) ==
               "hI, hOw ArE yOU")

s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]'
c = re.compile" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' "
-- the first bracket is replaced by "" (its level is empty), so the blanks
-- that surrounded it end up adjacent: two spaces after 'hi'
assert(re.gsub(s, c, "%2") == 'hi  and =]')
assert(re.gsub(s, c, "%0") == s)
assert(re.gsub('[=[hi]=]', c, "%2") == '=')

assert(re.find("", "!.") == 1)
assert(re.find("alo", "!.") == 4)

-- match-time function: stores 'tag' (nil when no closing tag matched) in
-- the captured table 't' and yields that table as the capture
function addtag (s, i, t, tag) t.tag = tag; return i, t end

-- 'end' must capture the closing tag name: that capture is what reaches
-- 'addtag' as its 'tag' argument
c = re.compile([[
  doc <- block !.
  block <- (start {| (block / { [^<]+ })* |} end?) => addtag
  start <- '<' {:tag: [a-z]+ :} '>'
  end <- '</' { =tag } '>'
]], {addtag = addtag})

-- the second <b> is never closed, so its table gets no 'tag' field
x = c:match[[
<x>hi<b>hello</b>but<b>totheend</x>]]
checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but',
                     {'totheend'}})


-- test for folding captures
c = re.compile([[
  S <- (number (%s+ number)*) ~> add
  number <- %d+ -> tonumber
]], {tonumber = tonumber, add = function (a,b) return a + b end})
assert(c:match("3 401 50") == 3 + 401 + 50)

-- test for accumulator captures
c = re.compile([[
  S <- number (%s+ number >> add)*
  number <- %d+ -> tonumber
]], {tonumber = tonumber, add = function (a,b) return a + b end})
assert(c:match("3 401 50") == 3 + 401 + 50)

-- tests for look-ahead captures
x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")}
checkeq(x, {"", "alo", ""})

assert(re.match("aloalo",
   "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}")
       == "AallooAalloo")

-- bug in 0.9 (and older versions), due to captures in look-aheads
x = re.compile[[   {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~}  ]]
assert(x:match"alo alo" == "+ +")

-- valid capture in look-ahead (used inside the look-ahead itself)
x = re.compile[[
      S <- &({:two: .. :} . =two) {[a-z]+} / . S
]]
assert(x:match("hello aloaLo aloalo xuxu") == "aloalo")


-- indentation-based blocks; both the grammar comment and the subject
-- below depend on real line breaks ('line' ends with %nl)
p = re.compile[[
  block <- {| {:ident:space*:} line
           ((=ident !space line) / &(=ident space) block)* |}
  line <- {[^%nl]*} %nl
  space <- '_'     -- should be ' ', but '_' is simpler for editors
]]

t= p:match[[
1
__1.1
__1.2
____1.2.1
____
2
__2.1
]]
checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"},
            "2", {"2.1", ident = "__"}, ident = ""})


-- nested grammars
p = re.compile[[
       s <- a b !.
       b <- ( x <- ('b' x)? )
       a <- ( x <- 'a' x? )
]]

assert(p:match'aaabbb')
assert(p:match'aaa')
assert(not p:match'bbb')
assert(not p:match'aaabbba')

-- testing groups
t = {re.match("abc", "{:S <- {:.:} {S} / '':}")}
checkeq(t, {"a", "bc", "b", "c", "c", ""})

t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}")
checkeq(t, {a="1", b="2", c="4"})
t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}")
checkeq(t, {a="1", b="2", c="4"})
t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}")
checkeq(t, {"1", b="2", "4", "5"})
t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}")
checkeq(t, {"1", "23", "4", "5"})
t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}")
checkeq(t, {"1", "23", "4", "5"})


-- testing pre-defined names
assert(os.setlocale("C") == "C")

-- Checks that the 're' class 'p1' denotes the same set of characters that
-- the Lua character-class body 'p2' keeps when filtering 'allchar'
-- ('cs2str' and 'allchar' are defined elsewhere in this file).
function eqlpeggsub (p1, p2)
  local s1 = cs2str(re.compile(p1))
  local s2 = string.gsub(allchar, "[^" .. p2 .. "]", "")
  -- if s1 ~= s2 then print(#s1,#s2) end
  assert(s1 == s2)
end


eqlpeggsub("%w", "%w")
eqlpeggsub("%a", "%a")
eqlpeggsub("%l", "%l")
eqlpeggsub("%u", "%u")
eqlpeggsub("%p", "%p")
eqlpeggsub("%d", "%d")
eqlpeggsub("%x", "%x")
eqlpeggsub("%s", "%s")
eqlpeggsub("%c", "%c")

eqlpeggsub("%W", "%W")
eqlpeggsub("%A", "%A")
eqlpeggsub("%L", "%L")
eqlpeggsub("%U", "%U")
eqlpeggsub("%P", "%P")
eqlpeggsub("%D", "%D")
eqlpeggsub("%X", "%X")
eqlpeggsub("%S", "%S")
eqlpeggsub("%C", "%C")

eqlpeggsub("[%w]", "%w")
eqlpeggsub("[_%w]", "_%w")
eqlpeggsub("[^%w]", "%W")
eqlpeggsub("[%W%S]", "%W%S")

re.updatelocale()


-- testing nested substitutions x string captures

p = re.compile[[
      text <- {~ item* ~}
      item <- macro / [^()] / '(' item* ')'
      arg <- ' '* {~ (!',' item)* ~}
      args <- '(' arg (',' arg)* ')'
      macro <- ('apply' args) -> '%1(%2)'
             / ('add' args) -> '%1 + %2'
             / ('mul' args) -> '%1 * %2'
]]

assert(p:match"add(mul(a,b), apply(f,x))" == "a * b + f(x)")

rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']]

assert(rev:match"0123456789" == "9876543210")


-- testing error messages in re

-- Checks that compiling the 're' pattern 'p' fails with an error reported
-- through 'checkerr' against the text 'err'.
local function errmsg (p, err)
  checkerr(err, re.compile, p)
end

errmsg('aaaa', "rule 'aaaa'")
errmsg('a', 'outside')
errmsg('b <- a', 'undefined')
errmsg("x <- 'a' x <- 'b'", 'already defined')
errmsg("'a' -", "near '-'")


print"OK"