gnu-emacs-sources
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

peg.el --- Parsing Expression Grammars in Emacs Lisp


From: Helmut Eller
Subject: peg.el --- Parsing Expression Grammars in Emacs Lisp
Date: Wed, 05 Nov 2008 13:32:28 +0100
User-agent: Gnus/5.11 (Gnus v5.11) Emacs/22.0.50 (gnu/linux)

These days, PEGs seems to be in fashion.  Here's an implementation for
your favorite editor.

Helmut.

;;; peg.el --- Parsing Expression Grammars in Emacs Lisp
;;
;; Copyright 2008  Helmut Eller <address@hidden>.
;;
;; This file is licensed under the terms of the GNU General Public
;; License as distributed with Emacs (press C-h C-c for details).
;;
;;; Commentary:
;;
;; Parsing Expression Grammars (PEG) are a formalism in the spirit of
;; Context Free Grammars (CFG) with some simplifications which makes
;; the implementation of PEGs as top-down parser particularly simple
;; and easy to understand [**].
;;
;; This file implements a macro `peg-parse' which parses the current
;; buffer according to a PEG.  E.g. we can match integers with a PEG
;; like this:
;;
;;  (peg-parse (number   sign digit (* digit))
;;             (sign     (or "+" "-" ""))
;;             (digit    '"09")))
;;
;; In contrast to regexps, PEGs allow us to define recursive rules.  A
;; PEG is a list of rules.  A rule is written as (NAME . PE).
;; E.g. (sign (or "+" "-" "")) is a rule with the name "sign".  For
;; convenience, PE is implicitly wrappend in a and. The syntax for
;; Parsing Expression (PE) is a follows:
;;
;; Description          Lisp            Haskell as in [*]
;; Sequence             (and e1 e2)     e1 e2
;; Prioritized Choice   (or e1 e2)      e1 / e2
;; Not-predicate        (not e)         !e
;; And-predictate       (if e)          &e
;; Any character        (any)           .
;; Literal string       "abc"           'abc'
;; Characater C         (char c)
;; Zero-or-more         (* e)           e*
;; One-or-more          (+ e)           e+
;; Optional             (or e "")       e?
;; Character range      (range a b)     [a-b]
;; Character range(2)   '"ab"           [a-b]
;;
;; `peg-parse' also support parsing actions, i.e. Lisp snippets which
;; are exectued when a PE matches.  This can be used to construct
;; syntax trees or similar tasks.  Actions are written as
;;
;;  (action FORM)          ; evaluate FORM
;;  (VAR... -- FORM...)    ; stack action
;; 
;; Actions don't consume input, but are executed at the point of
;; match.  A "stack action" takes VARs from the "value stack" and
;; pushes the result of evaluating FORMs to that stack.  See
;; `peg-ex-parse-int' for an example.
;;
;; References:
;;
;; [*] Bryan Ford. Parsing Expression Grammars: a Recognition-Based
;; Syntactic Foundation. In POPL'04: Proceedings of the 31st ACM
;; SIGPLAN-SIGACT symposium on Principles of Programming Languages,
;; pages 111-122, New York, NY, USA, 2004. ACM Press.
;;
;; [**] Baker, Henry G. "Pragmatic Parsing in Common Lisp".  ACM Lisp
;; Pointers 4(2), April--June 1991, pp. 3--15.
;;

;;; Code:

(defmacro peg-parse (&rest rules)
  "Match RULES at point.
Return (T STACK) if the match succeed and nil on failure."
  (peg-translate-rules rules))

(defmacro peg-parse-exp (exp)
  "Match the parsing expression EXP at point.
Note: a PE can't \"call\" rules by name."
  (peg-translate-exp exp))

;; A table of the PEG rules.  Used during compilation to resolve
;; references to named rules.
(defvar peg-rules)

;; used at runtime for backtracking.  It's a list ((POS . THUNK)...).
;; Each THUNK is executed at the corresponing POS.  Thunks are
;; executed in a postprocessing step, not during parsing.
(defvar peg-thunks)

;; The basic idea is to translate each rule to a lisp function.
;; The result looks like
;;   (let ((rule1 (lambda () code-for-rule1))
;;         ...
;;         (ruleN (lambda () code-for-ruleN)))
;;     (funcall rule1))
;; 
;; code-for-ruleX returns t if the rule matches and nil otherwise.
;;
(defun peg-translate-rules (rules)
  "Translate the PEG RULES, to a top-down parser."
  (let ((peg-rules (make-hash-table :size 20)))
    (dolist (rule rules)
      (puthash (car rule) 'defer peg-rules))
    (dolist (rule rules)
      (puthash (car rule) (peg-normalize `(and . ,(cdr rule))) peg-rules))
    (peg-check-cycles peg-rules)
    `(let ((peg-thunks '()))
       (let ,(mapcar (lambda (rule)
                       (let ((name (car rule)))
                         `(,name
                           (lambda () 
                             ,(peg-translate-exp (gethash name peg-rules))))))
                     rules)
         (when (funcall ,(car (car rules)))
           (peg-postprocess peg-thunks))))))

(defvar peg-normalizers)
(setq peg-normalizers (make-hash-table :size 2))

;; Internaly we use a regularized syntax, e.g. we only have binary OR
;; nodes.  Regularized nodes are lists of the form (OP ARGS...).
(defun peg-normalize (exp)
  "Return a \"normalized\" form of EXP."
  (cond ((consp exp)
         (let ((fun (gethash (car exp) peg-normalizers)))
           (cond (fun (apply fun (cdr exp)))
                 (t (error "Invalid PEG exp: %S" exp)))))
        ((stringp exp)
         (let ((len (length exp)))
           (cond ((zerop len) '(null))
                 ((= len 1) `(char ,(aref exp 0)))
                 (t `(str ,exp)))))
        ((and (symbolp exp) exp)
         (when (not (gethash exp peg-rules))
           (error "Reference to undefined PEG rule: %S" exp))
         `(call ,exp))))

(defmacro peg-define-normalizer (name args &rest body)
  (declare (indent defun))
  `(puthash ',name (lambda ,args . ,body) peg-normalizers))

(dolist (name '(null fail any action char range str eob))
  (puthash name (lambda (x) (cons name x)) peg-normalizers))

(peg-define-normalizer or (&rest args)
  (cond ((null args) '(fail))
        ((null (cdr args)) (peg-normalize (car args)))
        (t `(or ,(peg-normalize (car args)) 
                ,(peg-normalize `(or . ,(cdr args)))))))

(peg-define-normalizer and (&rest args)
  (cond ((null args) '(null))
        ((null (cdr args)) (peg-normalize (car args)))
        (t `(and ,(peg-normalize (car args)) 
                 ,(peg-normalize `(and . ,(cdr args)))))))

(peg-define-normalizer * (&rest args)
  `(* ,(peg-normalize `(and . ,args))))

(peg-define-normalizer + (&rest args)
  (let ((e (peg-normalize `(and . ,args))))
    `(and ,e (* ,e))))

(peg-define-normalizer opt (&rest args)
  (let ((e (peg-normalize `(and . ,args))))
    `(or ,e (null))))

(peg-define-normalizer \` (form)
  (unless (member '-- form)
    (error "Malformed stack action: %S" form))
  (let ((args (cdr (member '-- (reverse form))))
        (values (cdr (member '-- form))))
    (let ((form `(let ,(mapcar (lambda (var) `(,var (pop peg-stack)))
                               args)
                   (setq peg-stack 
                         (append (list . ,values) peg-stack)))))
      `(action ,form))))

(peg-define-normalizer quote (form)
  (cond ((and (stringp form) (= (length form) 2))
         `(range ,(aref form 0) ,(aref form 1)))
        (t (error "Invalid PEG exp: %S" `(quote ,form)))))

(defvar peg-translators)
(setq peg-translators (make-hash-table :size 20))

;; This is the main translation function.
(defun peg-translate-exp (exp)
  "Return the ELisp code to match the PE EXP."
  (let ((translator (or (gethash (car exp) peg-translators)
                        (error "Unknown PEG operator: %S" (car exp)))))
    (apply translator (cdr exp))))

(defmacro peg-define-translator (name args &rest body)
  (declare (indent defun))
  `(puthash ',name (lambda ,args . ,body) peg-translators))

(peg-define-translator and (e1 e2)
  `(and ,(peg-translate-exp e1)
        ,(peg-translate-exp e2)))

(peg-define-translator or (e1 e2)
  (let ((cp (peg-make-choicepoint)))
    `(,@(peg-save-choicepoint cp)
      (or ,(peg-translate-exp e1)
          (,@(peg-restore-choicepoint cp)
           ,(peg-translate-exp e2))))))

;; Choicepoints are used for backtracking.  At a choicepoint we save
;; enough state, so that we can continue from there if needed.
(defun peg-make-choicepoint ()
  (cons (make-symbol "point") (make-symbol "thunks")))

(defun peg-save-choicepoint (choicepoint)
  `(let ((,(car choicepoint) (point))
         (,(cdr choicepoint) peg-thunks))))

(defun peg-restore-choicepoint (choicepoint)
  `(progn 
     (goto-char ,(car choicepoint))
     (setq peg-thunks ,(cdr choicepoint))))

(peg-define-translator null ()
  `t)

(peg-define-translator any ()
  '(not (eobp)))

(peg-define-translator eob ()
  '(eobp))

(peg-define-translator * (e)
  `(progn (while ,(peg-translate-exp e))
          t))

(peg-define-translator if (e)
  (let ((cp (peg-make-choicepoint)))
    `(,@(peg-save-choicepoint cp)
      (when ,(peg-translate-exp e)
        (,@(peg-restore-choicepoint cp))
        t))))

(peg-define-translator char (c)
  `(when (eq (char-after) ',c)
     (forward-char)
     t))

(peg-define-translator range (from to)
  `(when (and (<= ',from (char-after))
              (<= (char-after) ',to))
     (forward-char)
     t))

(peg-define-translator str (str)
  `(when (looking-at ',(regexp-quote str))
     (goto-char (match-end 0))
     t))

(peg-define-translator call (name)
  (or (gethash name peg-rules) 
      (error "Reference to unkown rule: %S" name))
  `(funcall ,name))

(peg-define-translator action (form)
  `(progn
     (push (cons (point) (lambda () ,form)) peg-thunks)
     t))

(defvar peg-stack)
(defun peg-postprocess (thunks)
  "Execute \"actions\"."
  (let  ((peg-stack '()))
    (dolist (thunk (reverse thunks))
      (goto-char (car thunk))
      (funcall (cdr thunk)))
    (list t peg-stack)))

;; Left recursion is presumably a common mistate when using PEGs.
;; Here we try to detect such mistakes.  Essentailly we traverse the
;; graph as long as we can without consuming input.  When we find a
;; recursive call we signal an error.

(defun peg-check-cycles (peg-rules)
  (maphash (lambda (name exp)
             (peg-detect-cycles exp (list name)))
           peg-rules))

(defvar peg-detect-cycles-methods)
(setq peg-detect-cycles-methods (make-hash-table :size 20))

(defun peg-detect-cycles (exp path)
  "Signal an error on a cycle.
Otherwise traverse EXP recursively and return T if EXP can match
without consuming input.  Return nil if EXP definetly consumes
input.  PATH is the list of rules that we have visited so far."
  (apply (or (gethash (car exp) peg-detect-cycles-methods) 
             (error "No detect-cycle method for: %S" exp))
         path (cdr exp)))

(defmacro peg-define-detect-cycles (name args &rest body)
  (declare (indent defun))
  `(puthash ',name (lambda ,args . ,body) peg-detect-cycles-methods))

(peg-define-detect-cycles call (path name)
  (cond ((member name path)
         (error "Possible left recursion: %s"
                (mapconcat #'symbol-name (reverse (cons name path)) " -> ")))
        (t
         (peg-detect-cycles (gethash name peg-rules) (cons name path)))))

(peg-define-detect-cycles and (path &rest args)
  (let ((nullable t))
    (while (and nullable args)
      (setq nullable (peg-detect-cycles (pop args) path)))
    nullable))

(peg-define-detect-cycles or (path &rest args)
  (let ((nullable nil))
    (while (and (not nullable) args)
      (setq nullable (peg-detect-cycles (pop args) path)))
    nullable))

(peg-define-detect-cycles *   (path e) (peg-unary-nullable e path))
(peg-define-detect-cycles if  (path e) (peg-unary-nullable e path))
(peg-define-detect-cycles not (path e) (peg-unary-nullable e path))

(defun peg-unary-nullable (exp path)
  (peg-detect-cycles exp path)
  t)

(peg-define-detect-cycles any   (path)       nil)
(peg-define-detect-cycles char  (path c)     nil)
(peg-define-detect-cycles range (path c1 c2) nil)
(peg-define-detect-cycles str   (path s)     (equal s ""))
(peg-define-detect-cycles null  (path)       t)
(peg-define-detect-cycles eob   (path)       t)
(peg-define-detect-cycles action (path form) t)

;;; Helpers

;; Similar to (CL's) position.
(defun peg-position (elt list)
  "Return the index of ELT in LIST.
Compare with `equal'."
  (let ((i 0)
        (result nil))
    (while (and list (not result))
      (cond ((equal elt (pop list))
             (setq result i))
            (t (setq i (1+ i)))))
    result))

;;; Examples:

;; peg-ex-recognize-int recognizes integers.  An integer begins with a
;; optional sign, then follows one or more digits.  Digits are all
;; characters in from 0 to 9.
;;
;; Notes: 
;; 1) "" matches the empty sequence, i.e. matches without
;; consuming input.  
;; 2) '"09" is the character range from 0 to 9.  This can also be
;; written as (range ?0 ?9).  Traditionally this would be written as
;; [0-9].
(defun peg-ex-recognize-int ()
  (peg-parse (number   sign digit (* digit))
             (sign     (or "+" "-" ""))
             (digit    '"09")))

;; peg-ex-parse-int recognizes integers and computes the corresponding
;; value.  The grammer is the same as for `peg-ex-recognize-int' added
;; with parsing actions.  Unfortunaletly, the actions add quite a bit
;; of clutter.
;;
;; The action for the sign rule pushes t on the stack for a minus sign
;; and nil for plus or no sign.
;;
;; The action for the digit rule pushes the value for a single digit.
;;
;; The action `(a b -- (+ (* a 10) b)), takes two items from the stack
;; and pushes the first digit times 10 added to second digit.
;;
;; The action `(sign val -- (if sign (- val) val)), negates the
;; value if the sign flag is true.
(defun peg-ex-parse-int ()
  (peg-parse (number sign
                     digit
                     (* digit `(a b -- (+ (* a 10) b)))
                     `(sign val -- (if sign (- val) val)))
             (sign (or (and "+" `(-- nil))
                       (and "-" `(-- t))
                       (and "" `(-- nil))))
             (digit '"09" `(-- (- (char-before) ?0)))))

;; Put point after the ) and press C-x C-e
;; (peg-ex-parse-int)-234234

;; We try to detect left recursions.
(defun peg-ex-left-recursion ()
  (eval '(peg-parse (exp (or term
                             (and exp "+" exp)))
                    (term (or digit
                              (and term "*" term)))
                    (digit '"09"))))

(provide 'peg)

;;; peg.el ends here


reply via email to

[Prev in Thread] Current Thread [Next in Thread]