[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 16/18] segment: New library for low-level phase of lexical syntax
From: |
Ben Pfaff |
Subject: |
[PATCH 16/18] segment: New library for low-level phase of lexical syntax analysis. |
Date: |
Sat, 19 Mar 2011 17:10:02 -0700 |
This library provides for a low-level part of lexical analysis for
PSPP syntax, which I call "segmentation". Segmentation accepts a
stream of UTF-8 bytes as input. It outputs a label (a segment type)
for each byte or contiguous sequence of bytes in the input.
The following commit will implement the high-level phase of lexical
analysis, called "scanning", that converts a sequence of segments into
PSPP tokens.
---
Smake | 1 +
src/language/lexer/automake.mk | 2 +
src/language/lexer/segment.c | 1631 +++++++++++++++++++++++++++++++++++
src/language/lexer/segment.h | 122 +++
src/libpspp/automake.mk | 2 +
src/libpspp/prompt.c | 42 +
src/libpspp/prompt.h | 32 +
tests/automake.mk | 16 +
tests/language/lexer/segment-test.c | 318 +++++++
tests/language/lexer/segment.at | 1070 +++++++++++++++++++++++
10 files changed, 3236 insertions(+), 0 deletions(-)
create mode 100644 src/language/lexer/segment.c
create mode 100644 src/language/lexer/segment.h
create mode 100644 src/libpspp/prompt.c
create mode 100644 src/libpspp/prompt.h
create mode 100644 tests/language/lexer/segment-test.c
create mode 100644 tests/language/lexer/segment.at
diff --git a/Smake b/Smake
index 3a3235c..14c2a75 100644
--- a/Smake
+++ b/Smake
@@ -49,6 +49,7 @@ GNULIB_MODULES = \
printf-posix \
printf-safe \
progname \
+ read-file \
regex \
relocatable-prog \
rename \
diff --git a/src/language/lexer/automake.mk b/src/language/lexer/automake.mk
index 71f6b41..b3d06fe 100644
--- a/src/language/lexer/automake.mk
+++ b/src/language/lexer/automake.mk
@@ -10,6 +10,8 @@ language_lexer_sources = \
src/language/lexer/subcommand-list.h \
src/language/lexer/format-parser.c \
src/language/lexer/format-parser.h \
+ src/language/lexer/segment.c \
+ src/language/lexer/segment.h \
src/language/lexer/value-parser.c \
src/language/lexer/value-parser.h \
src/language/lexer/variable-parser.c \
diff --git a/src/language/lexer/segment.c b/src/language/lexer/segment.c
new file mode 100644
index 0000000..9900cd7
--- /dev/null
+++ b/src/language/lexer/segment.c
@@ -0,0 +1,1631 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/lexer/segment.h"
+
+#include <limits.h>
+#include <unistr.h>
+
+#include "data/identifier.h"
+#include "language/lexer/command-name.h"
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+
+#include "gl/c-ctype.h"
+#include "gl/c-strcase.h"
+
+enum segmenter_state
+ {
+ S_SHBANG,
+ S_GENERAL,
+ S_COMMENT_1,
+ S_COMMENT_2,
+ S_DOCUMENT_1,
+ S_DOCUMENT_2,
+ S_DOCUMENT_3,
+ S_FILE_LABEL,
+ S_DO_REPEAT_1,
+ S_DO_REPEAT_2,
+ S_DO_REPEAT_3,
+ S_BEGIN_DATA_1,
+ S_BEGIN_DATA_2,
+ S_BEGIN_DATA_3,
+ S_BEGIN_DATA_4,
+ S_TITLE_1,
+ S_TITLE_2
+ };
+
+#define SS_START_OF_LINE (1u << 0)
+#define SS_START_OF_COMMAND (1u << 1)
+
+static int segmenter_detect_command_name__ (const char *input,
+ size_t n, int ofs);
+
+static int
+segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n)
+{
+ const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
+ int mblen;
+
+ assert (n > 0);
+
+ mblen = u8_mbtoucr (puc, input, n);
+ return (mblen >= 0 ? mblen
+ : mblen == -2 ? -1
+ : u8_mbtouc (puc, input, n));
+}
+
+static int
+segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ if (input[0] == '#')
+ {
+ if (n < 2)
+ return -1;
+ else if (input[1] == '!')
+ {
+ int ofs;
+
+ for (ofs = 2; ofs < n; ofs++)
+ if (input[ofs] == '\n')
+ {
+ if (input[ofs - 1] == '\r')
+ ofs--;
+
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_COMMAND;
+ *type = SEG_SHBANG;
+ return ofs;
+ }
+
+ return -1;
+ }
+ }
+
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
+ return segmenter_push (s, input, n, type);
+}
+
+static int
+segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ assert (s->state == S_GENERAL);
+
+ if (n < 2)
+ return -1;
+
+ *type = SEG_PUNCT;
+ s->substate = 0;
+ return input[1] != '\0' && strchr (seconds, input[1]) != NULL ? 2 : 1;
+}
+
+static int
+skip_comment (const char *input, size_t n, size_t ofs)
+{
+ for (; ofs < n; ofs++)
+ {
+ if (input[ofs] == '\n')
+ return ofs;
+ else if (input[ofs] == '*')
+ {
+ if (ofs + 1 >= n)
+ return -1;
+ else if (input[ofs + 1] == '/')
+ return ofs + 2;
+ }
+ }
+ return -1;
+}
+
+static int
+skip_spaces_and_comments (const char *input, size_t n, int ofs)
+{
+ while (ofs < n)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ if (uc == '/')
+ {
+ if (ofs + 1 >= n)
+ return -1;
+ else if (input[ofs + 1] != '*')
+ return ofs;
+
+ ofs = skip_comment (input, n, ofs + 2);
+ if (ofs < 0)
+ return -1;
+ }
+ else if (lex_uc_is_space (uc) && uc != '\n')
+ ofs += mblen;
+ else
+ return ofs;
+ }
+
+ return -1;
+}
+
+static int
+is_end_of_line (const char *input, size_t n, int ofs)
+{
+ if (input[ofs] == '\n')
+ return 1;
+ else if (input[ofs] == '\r')
+ {
+ if (ofs + 1 >= n)
+ return -1;
+ return input[ofs + 1] == '\n';
+ }
+ else
+ return 0;
+}
+
+static int
+at_end_of_line (const char *input, size_t n, int ofs)
+{
+ ofs = skip_spaces_and_comments (input, n, ofs);
+ if (ofs < 0)
+ return -1;
+
+ return is_end_of_line (input, n, ofs);
+}
+
+
+static int
+segmenter_parse_newline__ (const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ if (input[0] == '\n')
+ ofs = 1;
+ else
+ {
+ if (n < 2)
+ return -1;
+
+ assert (input[0] == '\r');
+ assert (input[1] == '\n');
+ ofs = 2;
+ }
+
+ *type = SEG_NEWLINE;
+ return ofs;
+}
+
+static int
+skip_spaces (const char *input, size_t n, size_t ofs)
+{
+ while (ofs < n)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ if (!lex_uc_is_space (uc) || uc == '\n')
+ return ofs;
+
+ ofs += mblen;
+ }
+
+ return -1;
+}
+
+static int
+skip_digits (const char *input, size_t n, int ofs)
+{
+ for (; ofs < n; ofs++)
+ if (!c_isdigit (input[ofs]))
+ return ofs;
+ return -1;
+}
+
+static int
+segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ assert (s->state == S_GENERAL);
+
+ ofs = skip_digits (input, n, 0);
+ if (ofs < 0)
+ return -1;
+
+ if (input[ofs] == '.')
+ {
+ ofs = skip_digits (input, n, ofs + 1);
+ if (ofs < 0)
+ return -1;
+ }
+
+ if (ofs >= n)
+ return -1;
+ if (input[ofs] == 'e' || input[ofs] == 'E')
+ {
+ ofs++;
+ if (ofs >= n)
+ return -1;
+
+ if (input[ofs] == '+' || input[ofs] == '-')
+ {
+ ofs++;
+ if (ofs >= n)
+ return -1;
+ }
+
+ if (!c_isdigit (input[ofs]))
+ {
+ *type = SEG_EXPECTED_EXPONENT;
+ s->substate = 0;
+ return ofs;
+ }
+
+ ofs = skip_digits (input, n, ofs);
+ if (ofs < 0)
+ return -1;
+ }
+
+ if (input[ofs - 1] == '.')
+ {
+ int eol = at_end_of_line (input, n, ofs);
+ if (eol < 0)
+ return -1;
+ else if (eol)
+ ofs--;
+ }
+
+ *type = SEG_NUMBER;
+ s->substate = 0;
+ return ofs;
+}
+
+static bool
+is_reserved_word (const char *s, int n)
+{
+ char s0, s1, s2, s3;
+
+ s0 = c_toupper (s[0]);
+ switch (n)
+ {
+ case 2:
+ s1 = c_toupper (s[1]);
+ return ((s0 == 'B' && s1 == 'Y')
+ || (s0 == 'E' && s1 == 'Q')
+ || (s0 == 'G' && (s1 == 'E' || s1 == 'T'))
+ || (s0 == 'L' && (s1 == 'E' || s1 == 'T'))
+ || (s0 == 'N' && s1 == 'E')
+ || (s0 == 'O' && s1 == 'R')
+ || (s0 == 'T' && s1 == 'O'));
+
+ case 3:
+ s1 = c_toupper (s[1]);
+ s2 = c_toupper (s[2]);
+ return ((s0 == 'A' && ((s1 == 'L' && s2 == 'L')
+ || (s1 == 'N' && s2 == 'D')))
+ || (s0 == 'N' && s1 == 'O' && s2 == 'T'));
+
+ case 4:
+ s1 = c_toupper (s[1]);
+ s2 = c_toupper (s[2]);
+ s3 = c_toupper (s[3]);
+ return s0 == 'W' && s1 == 'I' && s2 == 'T' && s3 == 'H';
+
+ default:
+ return false;
+ }
+}
+
+static int
+segmenter_parse_comment_1__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int endcmd;
+ int ofs;
+
+ endcmd = -2;
+ ofs = 0;
+ while (ofs < n)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ switch (uc)
+ {
+ case '.':
+ endcmd = ofs;
+ break;
+
+ case '\n':
+ if (ofs > 1 && input[ofs - 1] == '\r')
+ ofs--;
+
+ if (endcmd == -2)
+ {
+ /* Blank line ends comment command. */
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_COMMAND;
+ *type = SEG_SEPARATE_COMMANDS;
+ return ofs;
+ }
+ else if (endcmd >= 0)
+ {
+ /* '.' at end of line ends comment command. */
+ s->state = S_GENERAL;
+ s->substate = 0;
+ *type = SEG_COMMENT_COMMAND;
+ return endcmd;
+ }
+ else
+ {
+ /* Comment continues onto next line. */
+ *type = SEG_COMMENT_COMMAND;
+ s->state = S_COMMENT_2;
+ return ofs;
+ }
+ NOT_REACHED ();
+
+ default:
+ if (!lex_uc_is_space (uc))
+ endcmd = -1;
+ break;
+ }
+
+ ofs += mblen;
+ }
+ return -1;
+}
+
+static int
+segmenter_parse_comment_2__ (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ int new_cmd;
+ ucs4_t uc;
+ int mblen;
+ int ofs;
+
+ ofs = segmenter_parse_newline__ (input, n, type);
+ if (ofs < 0 || ofs >= n)
+ return -1;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ if (uc == '+' || uc == '-' || uc == '.')
+ new_cmd = 1;
+ else if (!lex_uc_is_space (uc))
+ switch (s->mode)
+ {
+ case SEG_MODE_INTERACTIVE:
+ new_cmd = false;
+ break;
+
+ case SEG_MODE_BATCH:
+ new_cmd = true;
+ break;
+
+ case SEG_MODE_AUTO:
+ new_cmd = segmenter_detect_command_name__ (input, n, ofs);
+ if (new_cmd < 0)
+ return -1;
+ break;
+ }
+
+ if (new_cmd)
+ {
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
+ }
+ else
+ s->state = S_COMMENT_1;
+ return ofs;
+}
+
+static int
+segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ bool end_cmd;
+ int ofs;
+
+ end_cmd = false;
+ ofs = 0;
+ while (ofs < n)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ switch (uc)
+ {
+ case '.':
+ end_cmd = true;
+ break;
+
+ case '\n':
+ if (ofs > 1 && input[ofs - 1] == '\r')
+ ofs--;
+
+ *type = SEG_DOCUMENT;
+ s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
+ return ofs;
+
+ default:
+ if (!lex_uc_is_space (uc))
+ end_cmd = false;
+ break;
+ }
+
+ ofs += mblen;
+ }
+ return -1;
+}
+
+static int
+segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ ofs = segmenter_parse_newline__ (input, n, type);
+ if (ofs < 0)
+ return -1;
+
+ s->state = S_DOCUMENT_1;
+ return ofs;
+}
+
+static int
+segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
+{
+ *type = SEG_END_COMMAND;
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
+ return 0;
+}
+
+static int
+segmenter_unquoted (const char *input, size_t n, int ofs)
+
+{
+ char c;
+
+ ofs = skip_spaces_and_comments (input, n, ofs);
+ if (ofs < 0)
+ return -1;
+
+ c = input[ofs];
+ return c != '\'' && c != '"' && c != '\n' && c != '\0';
+}
+
+static int
+next_id_in_command (const struct segmenter *s, const char *input, size_t n,
+ int ofs, char id[], size_t id_size)
+{
+ struct segmenter sub;
+
+ assert (id_size > 0);
+
+ sub.mode = s->mode;
+ sub.state = S_GENERAL;
+ sub.substate = 0;
+ for (;;)
+ {
+ enum segment_type type;
+ int retval;
+
+ retval = segmenter_push (&sub, input + ofs, n - ofs, &type);
+ if (retval < 0)
+ {
+ id[0] = '\0';
+ return -1;
+ }
+
+ switch (type)
+ {
+ case SEG_SHBANG:
+ case SEG_SPACES:
+ case SEG_COMMENT:
+ case SEG_NEWLINE:
+ break;
+
+ case SEG_IDENTIFIER:
+ if (retval < id_size)
+ {
+ memcpy (id, input + ofs, retval);
+ id[retval] = '\0';
+ return ofs + retval;
+ }
+ /* fall through */
+
+ case SEG_NUMBER:
+ case SEG_QUOTED_STRING:
+ case SEG_HEX_STRING:
+ case SEG_UNICODE_STRING:
+ case SEG_UNQUOTED_STRING:
+ case SEG_RESERVED_WORD:
+ case SEG_PUNCT:
+ case SEG_COMMENT_COMMAND:
+ case SEG_DO_REPEAT_COMMAND:
+ case SEG_INLINE_DATA:
+ case SEG_START_DOCUMENT:
+ case SEG_DOCUMENT:
+ case SEG_START_COMMAND:
+ case SEG_SEPARATE_COMMANDS:
+ case SEG_END_COMMAND:
+ case SEG_END:
+ case SEG_EXPECTED_QUOTE:
+ case SEG_EXPECTED_EXPONENT:
+ case SEG_UNEXPECTED_DOT:
+ case SEG_UNEXPECTED_CHAR:
+ id[0] = '\0';
+ return ofs + retval;
+
+ case SEG_N_TYPES:
+ NOT_REACHED ();
+ }
+ ofs += retval;
+ }
+}
+
+static int
+segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ ucs4_t uc;
+ int ofs;
+
+ assert (s->state == S_GENERAL);
+
+ ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
+ for (;;)
+ {
+ int mblen;
+
+ if (ofs >= n)
+ return -1;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+ else if (!lex_uc_is_idn (uc))
+ break;
+
+ ofs += mblen;
+ }
+
+ if (input[ofs - 1] == '.')
+ {
+ int eol = at_end_of_line (input, n, ofs);
+ if (eol < 0)
+ return -1;
+ else if (eol)
+ ofs--;
+ }
+
+ if (is_reserved_word (input, ofs))
+ *type = SEG_RESERVED_WORD;
+ else
+ *type = SEG_IDENTIFIER;
+
+ if (s->substate & SS_START_OF_COMMAND)
+ {
+ struct substring word = ss_buffer (input, ofs);
+
+ if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
+ {
+ s->state = S_COMMENT_1;
+ return segmenter_parse_comment_1__ (s, input, n, type);
+ }
+ else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
+ {
+ s->state = S_DOCUMENT_1;
+ *type = SEG_START_DOCUMENT;
+ return 0;
+ }
+ else if (lex_id_match (ss_cstr ("TITLE"), word)
+ || lex_id_match (ss_cstr ("SUBTITLE"), word))
+ {
+ int result = segmenter_unquoted (input, n, ofs);
+ if (result < 0)
+ return -1;
+ else if (result)
+ {
+ s->state = S_TITLE_1;
+ return ofs;
+ }
+ }
+ else if (lex_id_match (ss_cstr ("FILE"), word))
+ {
+ char id[16];
+
+ if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
+ return -1;
+ else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
+ {
+ s->state = S_FILE_LABEL;
+ s->substate = 0;
+ return ofs;
+ }
+ }
+ else if (lex_id_match (ss_cstr ("DO"), word))
+ {
+ char id[16];
+
+ if (next_id_in_command (s, input, n, ofs, id, sizeof id) < 0)
+ return -1;
+ else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
+ {
+ s->state = S_DO_REPEAT_1;
+ s->substate = 0;
+ return ofs;
+ }
+ }
+ else if (lex_id_match (ss_cstr ("BEGIN"), word))
+ {
+ char id[16];
+ int ofs2;
+
+ ofs2 = next_id_in_command (s, input, n, ofs, id, sizeof id);
+ if (ofs2 < 0)
+ return -1;
+ else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
+ {
+ int eol;
+
+ ofs2 = skip_spaces_and_comments (input, n, ofs2);
+ if (ofs2 < 0)
+ return -1;
+
+ if (input[ofs2] == '.')
+ {
+ ofs2 = skip_spaces_and_comments (input, n, ofs2 + 1);
+ if (ofs2 < 0)
+ return -1;
+ }
+
+ eol = is_end_of_line (input, n, ofs2);
+ if (eol < 0)
+ return -1;
+ else if (eol)
+ {
+ if (memchr (input, '\n', ofs2))
+ s->state = S_BEGIN_DATA_1;
+ else
+ s->state = S_BEGIN_DATA_2;
+ s->substate = 0;
+ return ofs;
+ }
+ }
+ }
+ }
+
+ s->substate = 0;
+ return ofs;
+}
+
+static int
+segmenter_parse_string__ (enum segment_type string_type,
+ int ofs, struct segmenter *s,
+ const char *input, size_t n, enum segment_type *type)
+{
+ int quote = input[ofs];
+
+ ofs++;
+ while (ofs < n)
+ if (input[ofs] == quote)
+ {
+ ofs++;
+ if (ofs >= n)
+ return -1;
+ else if (input[ofs] == quote)
+ ofs++;
+ else
+ {
+ *type = string_type;
+ s->substate = 0;
+ return ofs;
+ }
+ }
+ else if (input[ofs] == '\n' || input[ofs] == '\0')
+ {
+ *type = SEG_EXPECTED_QUOTE;
+ s->substate = 0;
+ return ofs;
+ }
+ else
+ ofs++;
+
+ return -1;
+}
+
+static int
+segmenter_maybe_parse_string__ (enum segment_type string_type,
+ struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ if (n < 2)
+ return -1;
+ else if (input[1] == '\'' || input[1] == '"')
+ return segmenter_parse_string__ (string_type, 1, s, input, n, type);
+ else
+ return segmenter_parse_id__ (s, input, n, type);
+}
+
+static int
+segmenter_parse_mid_command__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ ucs4_t uc;
+ int mblen;
+ int ofs;
+
+ assert (s->state == S_GENERAL);
+ assert (!(s->substate & SS_START_OF_LINE));
+
+ mblen = segmenter_u8_to_uc__ (&uc, input, n);
+ if (mblen < 0)
+ return -1;
+
+ switch (uc)
+ {
+ case '\n':
+ s->substate |= SS_START_OF_LINE;
+ *type = SEG_NEWLINE;
+ return 1;
+
+ case '/':
+ if (n == 1)
+ return -1;
+ else if (input[1] == '*')
+ {
+ ofs = skip_comment (input, n, 2);
+ if (ofs < 0)
+ return -1;
+
+ *type = SEG_COMMENT;
+ return ofs;
+ }
+ else
+ {
+ s->substate = 0;
+ *type = SEG_PUNCT;
+ return 1;
+ }
+
+ case '(': case ')': case ',': case '=': case '-':
+ case '[': case ']': case '&': case '|': case '+':
+ *type = SEG_PUNCT;
+ s->substate = 0;
+ return 1;
+
+ case '*':
+ if (s->substate & SS_START_OF_COMMAND)
+ {
+ /* '*' at the beginning of a command begins a comment. */
+ s->state = S_COMMENT_1;
+ return segmenter_parse_comment_1__ (s, input, n, type);
+ }
+ else
+ return segmenter_parse_digraph__ ("*", s, input, n, type);
+
+ case '<':
+ return segmenter_parse_digraph__ ("=>", s, input, n, type);
+
+ case '>':
+ return segmenter_parse_digraph__ ("=", s, input, n, type);
+
+ case '~':
+ return segmenter_parse_digraph__ ("=", s, input, n, type);
+
+ case '.':
+ if (n < 2)
+ return -1;
+ else if (c_isdigit (input[1]))
+ return segmenter_parse_number__ (s, input, n, type);
+ else
+ {
+ int eol = at_end_of_line (input, n, 1);
+ if (eol < 0)
+ return -1;
+
+ if (eol)
+ {
+ *type = SEG_END_COMMAND;
+ s->substate = SS_START_OF_COMMAND;
+ }
+ else
+ *type = SEG_UNEXPECTED_DOT;
+ return 1;
+ }
+ NOT_REACHED ();
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ return segmenter_parse_number__ (s, input, n, type);
+
+ case 'u': case 'U':
+ return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
+ s, input, n, type);
+
+ case 'x': case 'X':
+ return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
+ s, input, n, type);
+
+ case '\'': case '"':
+ return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
+ s, input, n, type);
+
+ default:
+ if (lex_uc_is_space (uc))
+ {
+ ofs = skip_spaces (input, n, mblen);
+ if (ofs < 0)
+ return -1;
+
+ if (input[ofs - 1] == '\r' && input[ofs] == '\n')
+ {
+ if (ofs == 1)
+ {
+ s->substate |= SS_START_OF_LINE;
+ *type = SEG_NEWLINE;
+ return 2;
+ }
+ else
+ ofs--;
+ }
+ *type = SEG_SPACES;
+ return ofs;
+ }
+ else if (lex_uc_is_id1 (uc))
+ return segmenter_parse_id__ (s, input, n, type);
+ else
+ {
+ *type = SEG_UNEXPECTED_CHAR;
+ s->substate = 0;
+ return mblen;
+ }
+ }
+}
+
+static int
+compare_commands (const void *a_, const void *b_)
+{
+ const char *const *ap = a_;
+ const char *const *bp = b_;
+ const char *a = *ap;
+ const char *b = *bp;
+
+ return c_strcasecmp (a, b);
+}
+
+static const char **
+segmenter_get_command_name_candidates (unsigned char first)
+{
+#define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
+#define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
+ static const char *commands[] =
+ {
+#include "language/command.def"
+ ""
+ };
+ static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
+#undef DEF_CMD
+#undef UNIMPL_CMD
+
+ static bool inited;
+
+ static const char **cindex[UCHAR_MAX + 1];
+
+ if (!inited)
+ {
+ size_t i;
+
+ inited = true;
+
+ qsort (commands, n_commands, sizeof *commands, compare_commands);
+ for (i = 0; i < n_commands; i++)
+ {
+ unsigned char c = c_toupper (commands[i][0]);
+ if (cindex[c] == NULL)
+ cindex[c] = &commands[i];
+ }
+ for (i = 0; i <= UCHAR_MAX; i++)
+ if (cindex[i] == NULL)
+ cindex[i] = &commands[n_commands];
+ }
+
+ return cindex[c_toupper (first)];
+}
+
+static int
+segmenter_detect_command_name__ (const char *input, size_t n, int ofs)
+{
+ const char **commands;
+
+ input += ofs;
+ n -= ofs;
+ ofs = 0;
+ for (;;)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ if (ofs >= n)
+ return -1;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ if (uc == '\n'
+ || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
+ break;
+
+ ofs += mblen;
+ }
+ if (input[ofs - 1] == '.')
+ ofs--;
+
+ for (commands = segmenter_get_command_name_candidates (input[0]);
+ c_toupper (input[0]) == c_toupper ((*commands)[0]);
+ commands++)
+ {
+ int missing_words;
+ bool exact;
+
+ if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
+ &exact, &missing_words)
+ && missing_words <= 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int
+is_start_of_string__ (const char *input, size_t n, int ofs)
+{
+ int c;
+
+ c = input[ofs];
+ if (c == 'x' || c == 'X' || c == 'u' || c == 'U')
+ {
+ if (ofs + 1 >= n)
+ return -1;
+
+ return input[ofs + 1] == '\'' || input[ofs + 1] == '"';
+ }
+ else
+ return c == '\'' || c == '"' || c == '\n';
+}
+
+static int
+segmenter_parse_start_of_line__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ ucs4_t uc;
+ int mblen;
+ int ofs;
+
+ assert (s->state == S_GENERAL);
+ assert (s->substate & SS_START_OF_LINE);
+
+ mblen = segmenter_u8_to_uc__ (&uc, input, n);
+ if (mblen < 0)
+ return -1;
+
+ switch (uc)
+ {
+ case '+':
+ ofs = skip_spaces_and_comments (input, n, 1);
+ if (ofs < 0)
+ return -1;
+ else
+ {
+ int is_string = is_start_of_string__ (input, n, ofs);
+ if (is_string < 0)
+ return -1;
+ else if (is_string)
+ {
+ /* This is punctuation that may separate pieces of a string. */
+ *type = SEG_PUNCT;
+ s->substate = 0;
+ return 1;
+ }
+ }
+ /* Fall through. */
+
+ case '-':
+ case '.':
+ *type = SEG_START_COMMAND;
+ s->substate = SS_START_OF_COMMAND;
+ return 1;
+
+ default:
+ if (lex_uc_is_space (uc))
+ {
+ int eol = at_end_of_line (input, n, 0);
+ if (eol < 0)
+ return -1;
+ else if (eol)
+ {
+ s->substate = SS_START_OF_COMMAND;
+ *type = SEG_SEPARATE_COMMANDS;
+ return 0;
+ }
+ break;
+ }
+
+ if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
+ break;
+ else if (s->mode == SEG_MODE_AUTO)
+ {
+ int cmd = segmenter_detect_command_name__ (input, n, 0);
+ if (cmd < 0)
+ return -1;
+ else if (cmd == 0)
+ break;
+ }
+ else
+ assert (s->mode == SEG_MODE_BATCH);
+
+ s->substate = SS_START_OF_COMMAND;
+ *type = SEG_START_COMMAND;
+ return 0;
+ }
+
+ s->substate = SS_START_OF_COMMAND;
+ return segmenter_parse_mid_command__ (s, input, n, type);
+}
+
+static int
+segmenter_parse_file_label__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ struct segmenter sub;
+ int ofs;
+
+ sub = *s;
+ sub.state = S_GENERAL;
+ ofs = segmenter_push (&sub, input, n, type);
+
+ if (ofs < 0)
+ return -1;
+ else if (*type == SEG_IDENTIFIER)
+ {
+ int result;
+
+ assert (lex_id_match (ss_cstr ("LABEL"),
+ ss_buffer ((char *) input, ofs)));
+ result = segmenter_unquoted (input, n, ofs);
+ if (result < 0)
+ return -1;
+ else
+ {
+ if (result)
+ s->state = S_TITLE_1;
+ else
+ *s = sub;
+ return ofs;
+ }
+ }
+ else
+ {
+ s->substate = sub.substate;
+ return ofs;
+ }
+}
+
+static int
+segmenter_subparse (struct segmenter *s,
+ const char *input, size_t n, enum segment_type *type)
+{
+ struct segmenter sub;
+ int ofs;
+
+ sub.mode = s->mode;
+ sub.state = S_GENERAL;
+ sub.substate = s->substate;
+ ofs = segmenter_push (&sub, input, n, type);
+ s->substate = sub.substate;
+ return ofs;
+}
+
+static int
+segmenter_parse_do_repeat_1__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs = segmenter_subparse (s, input, n, type);
+ if (ofs < 0)
+ return -1;
+
+ if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
+ s->state = S_DO_REPEAT_2;
+ else if (*type == SEG_END_COMMAND)
+ {
+ s->state = S_DO_REPEAT_3;
+ s->substate = 1;
+ }
+
+ return ofs;
+}
+
+static int
+segmenter_parse_do_repeat_2__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs = segmenter_subparse (s, input, n, type);
+ if (ofs < 0)
+ return -1;
+
+ if (*type == SEG_NEWLINE)
+ {
+ s->state = S_DO_REPEAT_3;
+ s->substate = 1;
+ }
+
+ return ofs;
+}
+
+static bool
+check_repeat_command (struct segmenter *s,
+ const char *input, size_t n)
+{
+ int direction;
+ char id[16];
+ int ofs;
+
+ ofs = 0;
+ if (input[ofs] == '+' || input[ofs] == '-')
+ ofs++;
+
+ ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
+ if (ofs < 0)
+ return false;
+ else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
+ direction = 1;
+ else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
+ direction = -1;
+ else
+ return true;
+
+ ofs = next_id_in_command (s, input, n, ofs, id, sizeof id);
+ if (ofs < 0)
+ return false;
+
+ if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
+ s->substate += direction;
+ return true;
+}
+
+static int
+segmenter_parse_full_line__ (const char *input, size_t n,
+ enum segment_type *type)
+{
+ const char *newline = memchr (input, '\n', n);
+
+ if (newline == NULL)
+ return -1;
+ else
+ {
+ int ofs = newline - input;
+ if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
+ {
+ *type = SEG_NEWLINE;
+ return ofs + 1;
+ }
+ else
+ return ofs - (input[ofs - 1] == '\r');
+ }
+}
+
+static int
+segmenter_parse_do_repeat_3__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ ofs = segmenter_parse_full_line__ (input, n, type);
+ if (ofs < 0 || input[ofs - 1] == '\n')
+ return ofs;
+ else if (!check_repeat_command (s, input, n))
+ return -1;
+ else if (s->substate == 0)
+ {
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
+ return segmenter_push (s, input, n, type);
+ }
+ else
+ {
+ *type = SEG_DO_REPEAT_COMMAND;
+ return ofs;
+ }
+}
+
+static int
+segmenter_parse_begin_data_1__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs = segmenter_subparse (s, input, n, type);
+ if (ofs < 0)
+ return -1;
+
+ if (*type == SEG_NEWLINE)
+ s->state = S_BEGIN_DATA_2;
+
+ return ofs;
+}
+
+static int
+segmenter_parse_begin_data_2__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs = segmenter_subparse (s, input, n, type);
+ if (ofs < 0)
+ return -1;
+
+ if (*type == SEG_NEWLINE)
+ s->state = S_BEGIN_DATA_3;
+
+ return ofs;
+}
+
+static bool
+is_end_data (const char *input, size_t n)
+{
+ const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
+ bool endcmd;
+ ucs4_t uc;
+ int mblen;
+ int ofs;
+
+ if (n < 3 || c_strncasecmp (input, "END", 3))
+ return false;
+
+ ofs = 3;
+ mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
+ if (!lex_uc_is_space (uc))
+ return false;
+ ofs += mblen;
+
+ if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
+ return false;
+ ofs += 4;
+
+ endcmd = false;
+ while (ofs < n)
+ {
+ mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
+ if (uc == '.')
+ {
+ if (endcmd)
+ return false;
+ endcmd = true;
+ }
+ else if (!lex_uc_is_space (uc))
+ return false;
+ ofs += mblen;
+ }
+
+ return true;
+}
+
+static int
+segmenter_parse_begin_data_3__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ ofs = segmenter_parse_full_line__ (input, n, type);
+ if (ofs < 0)
+ return -1;
+ else if (is_end_data (input, ofs))
+ {
+ s->state = S_GENERAL;
+ s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
+ return segmenter_push (s, input, n, type);
+ }
+ else
+ {
+ *type = SEG_INLINE_DATA;
+ s->state = S_BEGIN_DATA_4;
+ return input[ofs - 1] == '\n' ? 0 : ofs;
+ }
+}
+
+static int
+segmenter_parse_begin_data_4__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ ofs = segmenter_parse_newline__ (input, n, type);
+ if (ofs < 0)
+ return -1;
+
+ s->state = S_BEGIN_DATA_3;
+ return ofs;
+}
+
+static int
+segmenter_parse_title_1__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int ofs;
+
+ ofs = skip_spaces (input, n, 0);
+ if (ofs < 0)
+ return -1;
+ s->state = S_TITLE_2;
+ *type = SEG_SPACES;
+ return ofs;
+}
+
+static int
+segmenter_parse_title_2__ (struct segmenter *s,
+ const char *input, size_t n,
+ enum segment_type *type)
+{
+ int endcmd;
+ int ofs;
+
+ endcmd = -1;
+ ofs = 0;
+ while (ofs < n)
+ {
+ ucs4_t uc;
+ int mblen;
+
+ mblen = segmenter_u8_to_uc__ (&uc, input + ofs, n - ofs);
+ if (mblen < 0)
+ return -1;
+
+ switch (uc)
+ {
+ case '\n':
+ s->state = S_GENERAL;
+ s->substate = 0;
+ *type = SEG_UNQUOTED_STRING;
+ return endcmd >= 0 ? endcmd : ofs;
+
+ case '.':
+ endcmd = ofs;
+ break;
+
+ default:
+ if (!lex_uc_is_space (uc))
+ endcmd = -1;
+ break;
+ }
+
+ ofs += mblen;
+ }
+
+ return -1;
+}
+
+/* Returns the name of segment TYPE as a string. The caller must not modify
+ or free the returned string.
+
+ This is useful only for debugging and testing. */
+const char *
+segment_type_to_string (enum segment_type type)
+{
+ switch (type)
+ {
+#define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
+ SEG_TYPES
+#undef SEG_TYPE
+ default:
+ return "unknown segment type";
+ }
+}
+
+/* Initializes S as a segmenter with the given syntax MODE.
+
+ A segmenter does not contain any external references, so nothing needs to be
+ done to destroy one. For the same reason, segmenters may be copied with
+ plain struct assignment (or memcpy). */
+void
+segmenter_init (struct segmenter *s, enum segmenter_mode mode)
+{
+ s->state = S_SHBANG;
+ s->substate = 0;
+ s->mode = mode;
+}
+
+/* Returns the mode passed to segmenter_init() for S. */
+enum segmenter_mode
+segmenter_get_mode (const struct segmenter *s)
+{
+ return s->mode;
+}
+
+/* Attempts to label a prefix of S's remaining input with a segment type. The
+ caller supplies the first N bytes of the remaining input as INPUT, which
+ must be a UTF-8 encoded string. The end of the input stream must be
+ indicated by a null byte at the beginning of a line, that is, immediately
+ following a new-line (or as the first byte of the input stream).
+
+ The input may contain '\n' or '\r\n' line ends in any combination.
+
+ If successful, returns the number of bytes in the segment at the beginning
+ of INPUT (between 0 and N, inclusive) and stores the type of that segment
+ into *TYPE. The next call to segmenter_push() should not include those
+ bytes as part of INPUT, because they have (figuratively) been consumed by
+ the segmenter.
+
+ Failure occurs only if the segment type of the N bytes in INPUT cannot yet
+ be determined. In this case segmenter_push() returns -1. The caller should
+ obtain more input and then call segmenter_push() again with a larger N and
+ repeat until the input is exhausted (which must be indicated as described
+ above) or until a valid segment is returned. segmenter_push() will never
+ return -1 when the end of input is visible within INPUT.
+
+ The caller must not, in a sequence of calls, supply contradictory input.
+ That is, bytes provided as part of INPUT in one call, but not consumed, must
+ not be provided with *different* values on subsequent calls. This is
+ because segmenter_push() must often make decisions based on looking ahead
+ beyond the bytes that it consumes. */
+int
+segmenter_push (struct segmenter *s, const char *input, size_t n,
+ enum segment_type *type)
+{
+ if (n == 0)
+ return -1;
+
+ if (input[0] == '\0')
+ {
+ *type = SEG_END;
+ return 1;
+ }
+
+ switch (s->state)
+ {
+ case S_SHBANG:
+ return segmenter_parse_shbang__ (s, input, n, type);
+
+ case S_GENERAL:
+ return (s->substate & SS_START_OF_LINE
+ ? segmenter_parse_start_of_line__ (s, input, n, type)
+ : segmenter_parse_mid_command__ (s, input, n, type));
+
+ case S_COMMENT_1:
+ return segmenter_parse_comment_1__ (s, input, n, type);
+ case S_COMMENT_2:
+ return segmenter_parse_comment_2__ (s, input, n, type);
+
+ case S_DOCUMENT_1:
+ return segmenter_parse_document_1__ (s, input, n, type);
+ case S_DOCUMENT_2:
+ return segmenter_parse_document_2__ (s, input, n, type);
+ case S_DOCUMENT_3:
+ return segmenter_parse_document_3__ (s, type);
+
+ case S_FILE_LABEL:
+ return segmenter_parse_file_label__ (s, input, n, type);
+
+ case S_DO_REPEAT_1:
+ return segmenter_parse_do_repeat_1__ (s, input, n, type);
+ case S_DO_REPEAT_2:
+ return segmenter_parse_do_repeat_2__ (s, input, n, type);
+ case S_DO_REPEAT_3:
+ return segmenter_parse_do_repeat_3__ (s, input, n, type);
+
+ case S_BEGIN_DATA_1:
+ return segmenter_parse_begin_data_1__ (s, input, n, type);
+ case S_BEGIN_DATA_2:
+ return segmenter_parse_begin_data_2__ (s, input, n, type);
+ case S_BEGIN_DATA_3:
+ return segmenter_parse_begin_data_3__ (s, input, n, type);
+ case S_BEGIN_DATA_4:
+ return segmenter_parse_begin_data_4__ (s, input, n, type);
+
+ case S_TITLE_1:
+ return segmenter_parse_title_1__ (s, input, n, type);
+ case S_TITLE_2:
+ return segmenter_parse_title_2__ (s, input, n, type);
+ }
+
+ NOT_REACHED ();
+}
+
+/* Returns the style of command prompt to display to an interactive user for
+ input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
+ and at the beginning of a line (that is, if segmenter_push() consumed as
+ much as possible of the input up to a new-line). */
+enum prompt_style
+segmenter_get_prompt (const struct segmenter *s)
+{
+ switch (s->state)
+ {
+ case S_SHBANG:
+ return PROMPT_FIRST;
+
+ case S_GENERAL:
+ return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
+
+ case S_COMMENT_1:
+ case S_COMMENT_2:
+ return PROMPT_COMMENT;
+
+ case S_DOCUMENT_1:
+ case S_DOCUMENT_2:
+ return PROMPT_DOCUMENT;
+ case S_DOCUMENT_3:
+ return PROMPT_FIRST;
+
+ case S_FILE_LABEL:
+ return PROMPT_LATER;
+
+ case S_DO_REPEAT_1:
+ case S_DO_REPEAT_2:
+ return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
+ case S_DO_REPEAT_3:
+ return PROMPT_DO_REPEAT;
+
+ case S_BEGIN_DATA_1:
+ return PROMPT_FIRST;
+ case S_BEGIN_DATA_2:
+ return PROMPT_LATER;
+ case S_BEGIN_DATA_3:
+ case S_BEGIN_DATA_4:
+ return PROMPT_DATA;
+
+ case S_TITLE_1:
+ case S_TITLE_2:
+ return PROMPT_FIRST;
+ }
+
+ NOT_REACHED ();
+}
diff --git a/src/language/lexer/segment.h b/src/language/lexer/segment.h
new file mode 100644
index 0000000..686b471
--- /dev/null
+++ b/src/language/lexer/segment.h
@@ -0,0 +1,122 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef SEGMENT_H
+#define SEGMENT_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include "libpspp/prompt.h"
+
+/* PSPP syntax segmentation.
+
+ PSPP divides traditional "lexical analysis" or "tokenization" into two
+ phases: a lower-level phase called "segmentation" and a higher-level phase
+ called "scanning". This header file provides declarations for the
+ segmentation phase. scan.h contains declarations for the scanning phase.
+
+ Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label
+ (a segment type) for each byte or contiguous sequence of bytes in the input.
+ It also, in a few corner cases, outputs zero-width segments that label the
+ boundary between a pair of bytes in the input.
+
+ Some segment types correspond directly to tokens; for example, an
+ "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID)
+ later in lexical analysis. Other segments contribute to tokens but do not
+ correspond diectly; for example, multiple quoted string segments
+ (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators
+ (SEG_PUNCT) may be combined to form a single string token (T_STRING).
+ Still other segments are ignored (e.g. SEG_SPACES) or trigger special
+ behavior such as error messages later in tokenization
+ (e.g. SEG_EXPECTED_QUOTE).
+*/
+
+/* Segmentation mode.
+
+ This corresponds to the syntax mode for which a syntax file is intended.
+ This is the only configuration setting for a segmenter. */
+enum segmenter_mode
+ {
+ /* Try to interpret input correctly regardless of whether it is written
+ for interactive or batch mode. */
+ SEG_MODE_AUTO,
+
+ /* Interactive or batch syntax mode. */
+ SEG_MODE_INTERACTIVE,
+ SEG_MODE_BATCH
+ };
+
+#define SEG_TYPES \
+ SEG_TYPE(NUMBER) \
+ SEG_TYPE(QUOTED_STRING) \
+ SEG_TYPE(HEX_STRING) \
+ SEG_TYPE(UNICODE_STRING) \
+ SEG_TYPE(UNQUOTED_STRING) \
+ SEG_TYPE(RESERVED_WORD) \
+ SEG_TYPE(IDENTIFIER) \
+ SEG_TYPE(PUNCT) \
+ \
+ SEG_TYPE(SHBANG) \
+ SEG_TYPE(SPACES) \
+ SEG_TYPE(COMMENT) \
+ SEG_TYPE(NEWLINE) \
+ \
+ SEG_TYPE(COMMENT_COMMAND) \
+ SEG_TYPE(DO_REPEAT_COMMAND) \
+ SEG_TYPE(INLINE_DATA) \
+ \
+ SEG_TYPE(START_DOCUMENT) \
+ SEG_TYPE(DOCUMENT) \
+ \
+ SEG_TYPE(START_COMMAND) \
+ SEG_TYPE(SEPARATE_COMMANDS) \
+ SEG_TYPE(END_COMMAND) \
+ SEG_TYPE(END) \
+ \
+ SEG_TYPE(EXPECTED_QUOTE) \
+ SEG_TYPE(EXPECTED_EXPONENT) \
+ SEG_TYPE(UNEXPECTED_DOT) \
+ SEG_TYPE(UNEXPECTED_CHAR)
+
+/* Types of segments. */
+enum segment_type
+ {
+#define SEG_TYPE(NAME) SEG_##NAME,
+ SEG_TYPES
+#undef SEG_TYPE
+ SEG_N_TYPES
+ };
+
+const char *segment_type_to_string (enum segment_type);
+
+/* A segmenter. Opaque. */
+struct segmenter
+ {
+ unsigned char state;
+ unsigned char substate;
+ unsigned char mode;
+ };
+
+void segmenter_init (struct segmenter *, enum segmenter_mode);
+
+enum segmenter_mode segmenter_get_mode (const struct segmenter *);
+
+int segmenter_push (struct segmenter *, const char *input, size_t n,
+ enum segment_type *);
+
+enum prompt_style segmenter_get_prompt (const struct segmenter *);
+
+#endif /* segment.h */
diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk
index a7c9283..fcb2814 100644
--- a/src/libpspp/automake.mk
+++ b/src/libpspp/automake.mk
@@ -60,6 +60,8 @@ src_libpspp_libpspp_la_SOURCES = \
src/libpspp/msg-locator.h \
src/libpspp/pool.c \
src/libpspp/pool.h \
+ src/libpspp/prompt.c \
+ src/libpspp/prompt.h \
src/libpspp/range-map.c \
src/libpspp/range-map.h \
src/libpspp/range-set.c \
diff --git a/src/libpspp/prompt.c b/src/libpspp/prompt.c
new file mode 100644
index 0000000..0722c3b
--- /dev/null
+++ b/src/libpspp/prompt.c
@@ -0,0 +1,42 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "libpspp/prompt.h"
+
+const char *
+prompt_style_to_string (enum prompt_style style)
+{
+ switch (style)
+ {
+ case PROMPT_FIRST:
+ return "first";
+ case PROMPT_LATER:
+ return "later";
+ case PROMPT_DATA:
+ return "data";
+ case PROMPT_COMMENT:
+ return "COMMENT";
+ case PROMPT_DOCUMENT:
+ return "DOCUMENT";
+ case PROMPT_DO_REPEAT:
+ return "DO REPEAT";
+ default:
+ return "unknown prompt";
+ }
+}
+
diff --git a/src/libpspp/prompt.h b/src/libpspp/prompt.h
new file mode 100644
index 0000000..14e820b
--- /dev/null
+++ b/src/libpspp/prompt.h
@@ -0,0 +1,32 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef PROMPT_H
+#define PROMPT_H 1
+
+enum prompt_style
+ {
+ PROMPT_FIRST, /* First line of command. */
+ PROMPT_LATER, /* Second or later line of command. */
+ PROMPT_DATA, /* Between BEGIN DATA and END DATA. */
+ PROMPT_COMMENT, /* COMMENT or * command. */
+ PROMPT_DOCUMENT, /* DOCUMENT command. */
+ PROMPT_DO_REPEAT /* DO REPEAT command. */
+ };
+
+const char *prompt_style_to_string (enum prompt_style);
+
+#endif /* prompt.h */
diff --git a/tests/automake.mk b/tests/automake.mk
index 0b4a825..4d49e5b 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -5,6 +5,7 @@ check_PROGRAMS += \
tests/data/sack \
tests/data/inexactify \
tests/language/lexer/command-name-test \
+ tests/language/lexer/segment-test \
tests/libpspp/abt-test \
tests/libpspp/bt-test \
tests/libpspp/encoding-guesser-test \
@@ -210,6 +211,20 @@ tests_language_lexer_command_name_test_LDADD = \
$(LIBINTL)
tests_language_lexer_command_name_test_CFLAGS = $(AM_CFLAGS)
+
+check_PROGRAMS += tests/language/lexer/segment-test
+tests_language_lexer_segment_test_SOURCES = \
+ src/data/identifier.c \
+ src/language/lexer/command-name.c \
+ src/language/lexer/segment.c \
+ src/libpspp/pool.c \
+ src/libpspp/prompt.c \
+ src/libpspp/str.c \
+ src/libpspp/temp-file.c \
+ tests/language/lexer/segment-test.c
+tests_language_lexer_segment_test_LDADD = gl/libgl.la $(LIBINTL)
+tests_language_lexer_segment_test_CFLAGS = $(AM_CFLAGS)
+
check_PROGRAMS += tests/output/render-test
tests_output_render_test_SOURCES = tests/output/render-test.c
tests_output_render_test_LDADD = \
@@ -291,6 +306,7 @@ TESTSUITE_AT = \
tests/language/lexer/command-name.at \
tests/language/lexer/lexer.at \
tests/language/lexer/q2c.at \
+ tests/language/lexer/segment.at \
tests/language/lexer/variable-parser.at \
tests/language/stats/aggregate.at \
tests/language/stats/autorecode.at \
diff --git a/tests/language/lexer/segment-test.c
b/tests/language/lexer/segment-test.c
new file mode 100644
index 0000000..64243c8
--- /dev/null
+++ b/tests/language/lexer/segment-test.c
@@ -0,0 +1,318 @@
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistr.h>
+
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/compiler.h"
+#include "libpspp/misc.h"
+#include "language/lexer/segment.h"
+
+#include "gl/error.h"
+#include "gl/minmax.h"
+#include "gl/progname.h"
+#include "gl/read-file.h"
+#include "gl/xalloc.h"
+
+/* -a/--auto, -b/--batch, -i/--interactive: syntax mode. */
+static enum segmenter_mode mode = SEG_MODE_AUTO;
+
+/* -v, --verbose: Print row and column information. */
+static bool verbose;
+
+/* -1, --one-byte: Feed in one byte at a time? */
+static bool one_byte;
+
+static const char *parse_options (int argc, char **argv);
+static void usage (void) NO_RETURN;
+
+int
+main (int argc, char *argv[])
+{
+ size_t offset, line_number, line_offset;
+ const char *file_name;
+ char *input;
+ struct segmenter s;
+ size_t length;
+ int prev_type;
+
+ set_program_name (argv[0]);
+ file_name = parse_options (argc, argv);
+
+ /* Read from stdin into 'input'. Ensure that 'input' ends in a new-line
+ followed by a null byte. */
+ input = (!strcmp (file_name, "-")
+ ? fread_file (stdin, &length)
+ : read_file (file_name, &length));
+ if (input == NULL)
+ error (EXIT_FAILURE, errno, "reading %s failed", file_name);
+ input = xrealloc (input, length + 3);
+ if (length == 0 || input[length - 1] != '\n')
+ input[length++] = '\n';
+ input[length++] = '\0';
+
+ segmenter_init (&s, mode);
+
+ line_number = 1;
+ line_offset = 0;
+ prev_type = -1;
+ for (offset = 0; offset < length; )
+ {
+ enum segment_type type;
+ const char *type_name, *p;
+ int n;
+
+ if (one_byte)
+ {
+ int n_newlines = 0;
+ int i;
+
+ for (i = 0; i <= length - offset; i++)
+ {
+ /* Make a copy to ensure that segmenter_push() isn't actually
+ looking ahead. */
+ char *copy;
+
+ if (i > 0 && input[offset + i - 1] == '\n')
+ n_newlines++;
+
+ copy = xmemdup (input + offset, i);
+ n = segmenter_push (&s, copy, i, &type);
+ free (copy);
+
+ if (n >= 0)
+ break;
+ }
+ assert (n_newlines <= 2);
+ }
+ else
+ n = segmenter_push (&s, input + offset, length - offset, &type);
+
+ if (n < 0)
+ error (EXIT_FAILURE, 0, "segmenter_push returned -1 at offset %zu",
+ offset);
+ assert (offset + n <= length);
+
+ if (type == SEG_NEWLINE)
+ assert ((n == 1 && input[offset] == '\n')
+ || (n == 2
+ && input[offset] == '\r' && input[offset + 1] == '\n'));
+ else
+ assert (memchr (&input[offset], '\n', n) == NULL);
+
+ if (!verbose)
+ {
+ if (prev_type != SEG_SPACES && prev_type != -1
+ && type == SEG_SPACES && n == 1 && input[offset] == ' ')
+ {
+ printf (" space\n");
+ offset++;
+ prev_type = -1;
+ continue;
+ }
+ }
+ if (prev_type != -1)
+ putchar ('\n');
+ prev_type = type;
+
+ if (verbose)
+ printf ("%2zu:%2zu: ", line_number, offset - line_offset);
+
+ type_name = segment_type_to_string (type);
+ for (p = type_name; *p != '\0'; p++)
+ putchar (tolower ((unsigned char) *p));
+ if (n > 0)
+ {
+ int i;
+
+ for (i = MIN (15, strlen (type_name)); i < 16; i++)
+ putchar (' ');
+ for (i = 0; i < n; )
+ {
+ const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
+ ucs4_t uc;
+ int mblen;
+
+ mblen = u8_mbtoucr (&uc, u_input + (offset + i), n - i);
+ if (mblen < 0)
+ {
+ int j;
+
+ mblen = u8_mbtouc (&uc, u_input + (offset + i), n - i);
+ putchar ('<');
+ for (j = 0; j < mblen; j++)
+ {
+ if (j > 0)
+ putchar (' ');
+ printf ("%02x", input[offset + i + j]);
+ }
+ putchar ('>');
+ }
+ else
+ {
+ switch (uc)
+ {
+ case ' ':
+ printf ("_");
+ break;
+
+ case '_':
+ printf ("\\_");
+ break;
+
+ case '\\':
+ printf ("\\\\");
+ break;
+
+ case '\t':
+ printf ("\\t");
+ break;
+
+ case '\r':
+ printf ("\\r");
+ break;
+
+ case '\n':
+ printf ("\\n");
+ break;
+
+ case '\v':
+ printf ("\\v");
+ break;
+
+ default:
+ if (uc < 0x20 || uc == 0x00a0)
+ printf ("<U+%04X>", uc);
+ else
+ fwrite (input + offset + i, 1, mblen, stdout);
+ break;
+ }
+ }
+
+ i += mblen;
+ }
+ }
+
+ offset += n;
+ if (type == SEG_NEWLINE)
+ {
+ enum prompt_style prompt;
+
+ line_number++;
+ line_offset = offset;
+
+ prompt = segmenter_get_prompt (&s);
+ printf (" (%s)\n", prompt_style_to_string (prompt));
+ }
+ }
+ putchar ('\n');
+
+ free (input);
+
+ return 0;
+}
+
+static const char *
+parse_options (int argc, char **argv)
+{
+ for (;;)
+ {
+ static const struct option options[] =
+ {
+ {"one-byte", no_argument, NULL, '1'},
+ {"auto", no_argument, NULL, 'a'},
+ {"batch", no_argument, NULL, 'b'},
+ {"interactive", no_argument, NULL, 'i'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"help", no_argument, NULL, 'h'},
+ {NULL, 0, NULL, 0},
+ };
+
+ int c = getopt_long (argc, argv, "1abivh", options, NULL);
+ if (c == -1)
+ break;
+
+ switch (c)
+ {
+ case '1':
+ one_byte = true;
+ break;
+
+ case 'a':
+ mode = SEG_MODE_AUTO;
+ break;
+
+ case 'b':
+ mode = SEG_MODE_BATCH;
+ break;
+
+ case 'i':
+ mode = SEG_MODE_INTERACTIVE;
+ break;
+
+ case 'v':
+ verbose = true;
+ break;
+
+ case 'h':
+ usage ();
+
+ case 0:
+ break;
+
+ case '?':
+ exit (EXIT_FAILURE);
+ break;
+
+ default:
+ NOT_REACHED ();
+ }
+
+ }
+
+ if (optind + 1 != argc)
+ error (1, 0, "exactly one non-option argument required; "
+ "use --help for help");
+ return argv[optind];
+}
+
+static void
+usage (void)
+{
+ printf ("\
+%s, to test breaking PSPP syntax into lexical segments\n\
+usage: %s [OPTIONS] INPUT\n\
+\n\
+Options:\n\
+ -1, --one-byte feed one byte at a time\n\
+ -a, --auto use \"auto\" syntax mode\n\
+ -b, --batch use \"batch\" syntax mode\n\
+ -i, --interactive use \"interactive\" syntax mode (default)\n\
+ -v, --verbose include rows and column numbers in output\n\
+ -h, --help print this help message\n",
+ program_name, program_name);
+ exit (EXIT_SUCCESS);
+}
diff --git a/tests/language/lexer/segment.at b/tests/language/lexer/segment.at
new file mode 100644
index 0000000..e1dd0b5
--- /dev/null
+++ b/tests/language/lexer/segment.at
@@ -0,0 +1,1070 @@
+AT_BANNER([syntax segmentation])
+m4_define([PSPP_CHECK_SEGMENT],
+ [AT_CHECK([segment-test $1 input], [0], [expout])
+ AT_CHECK([segment-test -1 $1 input], [0], [expout])])
+
+AT_SETUP([identifiers])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+a ab abc abcd
+A AB ABC ABCD
+aB aBC aBcD
+$x $y $z
+grève@<00A0>@Ângstrom@<00A0>@poté
+#a #b #c ## #d
address@hidden @ @@. @#@ @&t@
+## # #12345 #.#
address@hidden
+GhIjK
+.x 1y _z
+])
+AT_DATA([expout], [dnl
+identifier a space
+identifier ab space
+identifier abc space
+identifier abcd
+newline \n (later)
+
+identifier A space
+identifier AB space
+identifier ABC space
+identifier ABCD
+newline \n (later)
+
+identifier aB space
+identifier aBC space
+identifier aBcD
+newline \n (later)
+
+identifier $x space
+identifier $y space
+identifier $z
+newline \n (later)
+
+identifier grève
+spaces <U+00A0>
+identifier Ângstrom
+spaces <U+00A0>
+identifier poté
+newline \n (later)
+
+identifier #a space
+identifier #b space
+identifier #c space
+identifier ## space
+identifier #d
+newline \n (later)
+
+identifier @efg space
+identifier @ space
+identifier @@. space
+identifier @#@ space
+newline \n (later)
+
+identifier ## space
+identifier # space
+identifier #12345 space
+identifier #.#
+newline \n (later)
+
+identifier address@hidden
+newline \n (later)
+
+identifier GhIjK
+newline \n (later)
+
+start_command .
+identifier x space
+number 1
+identifier y space
+unexpected_char \_
+identifier z
+newline \n (later)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([identifiers that end in '.'])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+abcd. abcd.
+ABCD. ABCD.
+aBcD. aBcD. @&t@
+$y. $z. あいうえお.
+#c. #d..
+@@. @@....
+#.#.
+#abcd.
+.
+. @&t@
+LMNOP. @&t@
+QRSTUV./* end of line comment */
+qrstuv. /* end of line comment */
+QrStUv./* end of line comment */ @&t@
+wxyz./* unterminated end of line comment
+WXYZ. /* unterminated end of line comment
+WxYz./* unterminated end of line comment @&t@
+])
+AT_DATA([expout], [dnl
+identifier abcd. space
+identifier abcd
+end_command .
+newline \n (first)
+
+identifier ABCD. space
+identifier ABCD
+end_command .
+newline \n (first)
+
+identifier aBcD. space
+identifier aBcD
+end_command . space
+newline \n (first)
+
+identifier $y. space
+identifier $z. space
+identifier あいうえお
+end_command .
+newline \n (first)
+
+identifier #c. space
+identifier #d.
+end_command .
+newline \n (first)
+
+identifier @@. space
+identifier @@...
+end_command .
+newline \n (first)
+
+identifier #.#
+end_command .
+newline \n (first)
+
+identifier #abcd
+end_command .
+newline \n (first)
+
+start_command .
+newline \n (first)
+
+start_command . space
+newline \n (first)
+
+identifier LMNOP
+end_command . space
+newline \n (first)
+
+identifier QRSTUV
+end_command .
+comment /*_end_of_line_comment_*/
+newline \n (first)
+
+identifier qrstuv
+end_command . space
+comment /*_end_of_line_comment_*/
+newline \n (first)
+
+identifier QrStUv
+end_command .
+comment /*_end_of_line_comment_*/ space
+newline \n (first)
+
+identifier wxyz
+end_command .
+comment /*_unterminated_end_of_line_comment
+newline \n (first)
+
+identifier WXYZ
+end_command . space
+comment /*_unterminated_end_of_line_comment
+newline \n (first)
+
+identifier WxYz
+end_command .
+comment /*_unterminated_end_of_line_comment_
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([reserved words])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+and or not eq ge gt le lt ne all by to with
+AND OR NOT EQ GE GT LE LT NE ALL BY TO WITH
+andx orx notx eqx gex gtx lex ltx nex allx byx tox withx
+and. with.
+])
+AT_DATA([expout], [dnl
+reserved_word and space
+reserved_word or space
+reserved_word not space
+reserved_word eq space
+reserved_word ge space
+reserved_word gt space
+reserved_word le space
+reserved_word lt space
+reserved_word ne space
+reserved_word all space
+reserved_word by space
+reserved_word to space
+reserved_word with
+newline \n (later)
+
+reserved_word AND space
+reserved_word OR space
+reserved_word NOT space
+reserved_word EQ space
+reserved_word GE space
+reserved_word GT space
+reserved_word LE space
+reserved_word LT space
+reserved_word NE space
+reserved_word ALL space
+reserved_word BY space
+reserved_word TO space
+reserved_word WITH
+newline \n (later)
+
+identifier andx space
+identifier orx space
+identifier notx space
+identifier eqx space
+identifier gex space
+identifier gtx space
+identifier lex space
+identifier ltx space
+identifier nex space
+identifier allx space
+identifier byx space
+identifier tox space
+identifier withx
+newline \n (later)
+
+identifier and. space
+reserved_word with
+end_command .
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([punctuation])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+~ & | = >= > <= < ~= <> ( ) , - + * / [[ ]] **
+~&|=>=><=<~=<>(),-+*/[[]]**
+])
+AT_DATA([expout], [dnl
+punct ~ space
+punct & space
+punct | space
+punct = space
+punct >= space
+punct > space
+punct <= space
+punct < space
+punct ~= space
+punct <> space
+punct ( space
+punct ) space
+punct , space
+punct - space
+punct + space
+punct * space
+punct / space
+punct [[ space
+punct ]] space
+punct **
+newline \n (later)
+
+punct ~
+punct &
+punct |
+punct =
+punct >=
+punct >
+punct <=
+punct <
+punct ~=
+punct <>
+punct (
+punct )
+punct ,
+punct -
+punct +
+punct *
+punct /
+punct [[
+punct ]]
+punct **
+newline \n (later)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([numbers])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+0 1 01 001. 1.
+123. /* comment 1 */ /* comment 2 */
+.1 0.1 00.1 00.10
+5e1 6E-1 7e+1 6E+01 6e-03
+.3E1 .4e-1 .5E+1 .6e+01 .7E-03
+1.23e1 45.6E-1 78.9e+1 99.9E+01 11.2e-03
+. 1e e1 1e+ 1e-
+])
+AT_DATA([expout], [dnl
+number 0 space
+number 1 space
+number 01 space
+number 001. space
+number 1
+end_command .
+newline \n (first)
+
+number 123
+end_command . space
+comment /*_comment_1_*/ space
+comment /*_comment_2_*/
+newline \n (first)
+
+start_command .
+number 1 space
+number 0.1 space
+number 00.1 space
+number 00.10
+newline \n (later)
+
+number 5e1 space
+number 6E-1 space
+number 7e+1 space
+number 6E+01 space
+number 6e-03
+newline \n (later)
+
+start_command .
+number 3E1 space
+number .4e-1 space
+number .5E+1 space
+number .6e+01 space
+number .7E-03
+newline \n (later)
+
+number 1.23e1 space
+number 45.6E-1 space
+number 78.9e+1 space
+number 99.9E+01 space
+number 11.2e-03
+newline \n (later)
+
+start_command . space
+expected_exponent 1e space
+identifier e1 space
+expected_exponent 1e+ space
+expected_exponent 1e-
+newline \n (later)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([strings])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+'x' "y" 'abc'
+'Don''t' "Can't" 'Won''t'
+"""quoted""" '"quoted"'
+'' ""
+'missing end quote
+"missing double quote
+x"4142" X'5152'
+u'fffd' U"041"
++ new command
++ /* comment */ 'string continuation'
++ /* also a punctuator on blank line
+- 'new command'
+])
+AT_DATA([expout], [dnl
+quoted_string 'x' space
+quoted_string "y" space
+quoted_string 'abc'
+newline \n (later)
+
+quoted_string 'Don''t' space
+quoted_string "Can't" space
+quoted_string 'Won''t'
+newline \n (later)
+
+quoted_string """quoted""" space
+quoted_string '"quoted"'
+newline \n (later)
+
+quoted_string '' space
+quoted_string ""
+newline \n (later)
+
+expected_quote 'missing_end_quote
+newline \n (later)
+
+expected_quote "missing_double_quote
+newline \n (later)
+
+hex_string x"4142" space
+hex_string X'5152'
+newline \n (later)
+
+unicode_string u'fffd' space
+unicode_string U"041"
+newline \n (later)
+
+start_command + space
+identifier new space
+identifier command
+newline \n (later)
+
+punct + space
+comment /*_comment_*/ space
+quoted_string 'string_continuation'
+newline \n (later)
+
+punct + space
+comment /*_also_a_punctuator_on_blank_line
+newline \n (later)
+
+start_command - space
+quoted_string 'new_command'
+newline \n (later)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP(address@hidden:@! construct])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+#! /usr/bin/pspp
+title my title.
+#! /usr/bin/pspp
+])
+AT_DATA([expout], [dnl
+shbang #!_/usr/bin/pspp
+newline \n (first)
+
+identifier title space
+unquoted_string my_title
+end_command .
+newline \n (first)
+
+identifier #
+unexpected_char ! space
+punct /
+identifier usr
+punct /
+identifier bin
+punct /
+identifier pspp
+newline \n (later)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([* and COMMENT commands])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+* Comment commands "don't
+have to contain valid tokens.
+
+** Check ambiguity with ** token.
+****************.
+
+comment keyword works too.
+COMM also.
+com is ambiguous with COMPUTE.
+
+ * Comment need not start at left margin.
+
+* Comment ends with blank line
+
+next command.
+
+])
+AT_DATA([expout], [dnl
+comment_command *_Comment_commands_"don't
+newline \n (COMMENT)
+
+comment_command have_to_contain_valid_tokens
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+comment_command **_Check_ambiguity_with_**_token
+end_command .
+newline \n (first)
+
+comment_command ****************
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+comment_command comment_keyword_works_too
+end_command .
+newline \n (first)
+
+comment_command COMM_also
+end_command .
+newline \n (first)
+
+identifier com space
+identifier is space
+identifier ambiguous space
+reserved_word with space
+identifier COMPUTE
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+spaces ___
+comment_command *_Comment_need_not_start_at_left_margin
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+comment_command *_Comment_ends_with_blank_line
+newline \n (COMMENT)
+
+separate_commands
+newline \n (first)
+
+identifier next space
+identifier command
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([DOCUMENT command])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+DOCUMENT one line.
+DOC more
+ than
+ one
+ line.
+docu
+first.paragraph
+isn't parsed as tokens
+
+second paragraph.
+])
+AT_DATA([expout], [dnl
+start_document
+document DOCUMENT_one_line.
+end_command
+separate_commands
+newline \n (first)
+
+start_document
+document DOC_more
+newline \n (DOCUMENT)
+
+document ____than
+newline \n (DOCUMENT)
+
+document ________one
+newline \n (DOCUMENT)
+
+document ____________line.
+end_command
+separate_commands
+newline \n (first)
+
+start_document
+document docu
+newline \n (DOCUMENT)
+
+document first.paragraph
+newline \n (DOCUMENT)
+
+document isn't_parsed_as_tokens
+newline \n (DOCUMENT)
+
+document
+newline \n (DOCUMENT)
+
+document second_paragraph.
+end_command
+separate_commands
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([TITLE, SUBTITLE, FILE LABEL commands])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+title/**/'Quoted string title'.
+tit /*
+"Quoted string on second line".
+sub "Quoted string subtitle"
+ .
+
+TITL /* Not a */ quoted string title.
+SUBT Not a quoted string /* subtitle
+
+FIL label isn't quoted.
+FILE
+ lab 'is quoted'.
+FILE /*
+/**/ lab not quoted here either
+
+])
+AT_DATA([expout], [dnl
+identifier title
+comment /**/
+quoted_string 'Quoted_string_title'
+end_command .
+newline \n (first)
+
+identifier tit space
+comment /*
+newline \n (later)
+
+quoted_string "Quoted_string_on_second_line"
+end_command .
+newline \n (first)
+
+identifier sub space
+quoted_string "Quoted_string_subtitle"
+newline \n (later)
+ space
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+identifier TITL space
+unquoted_string /*_Not_a_*/_quoted_string_title
+end_command .
+newline \n (first)
+
+identifier SUBT space
+unquoted_string Not_a_quoted_string_/*_subtitle
+newline \n (later)
+
+separate_commands
+newline \n (first)
+
+identifier FIL space
+identifier label space
+unquoted_string isn't_quoted
+end_command .
+newline \n (first)
+
+identifier FILE
+newline \n (later)
+
+spaces __
+identifier lab space
+quoted_string 'is_quoted'
+end_command .
+newline \n (first)
+
+identifier FILE space
+comment /*
+newline \n (later)
+
+comment /**/
+spaces __
+identifier lab space
+unquoted_string not_quoted_here_either
+newline \n (later)
+
+separate_commands
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([BEGIN DATA command])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+begin data.
+end data.
+
+begin data. /*
+123
+xxx
+end data.
+
+BEG /**/ DAT /*
+5 6 7 /* x
+
+end data
+end data
+.
+
+begin
+ data.
+data
+end data.
+
+begin data "xxx".
+begin data 123.
+not data
+])
+AT_DATA([expout], [dnl
+identifier begin space
+identifier data
+end_command .
+newline \n (data)
+
+identifier end space
+identifier data
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+identifier begin space
+identifier data
+end_command . space
+comment /*
+newline \n (data)
+
+inline_data 123
+newline \n (data)
+
+inline_data xxx
+newline \n (data)
+
+identifier end space
+identifier data
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+identifier BEG space
+comment /**/ space
+identifier DAT space
+comment /*
+newline \n (data)
+
+inline_data 5_6_7_/*_x
+newline \n (data)
+
+inline_data
+newline \n (data)
+
+inline_data end__data
+newline \n (data)
+
+identifier end space
+identifier data
+newline \n (later)
+
+start_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+identifier begin
+newline \n (later)
+ space
+identifier data
+end_command .
+newline \n (data)
+
+inline_data data
+newline \n (data)
+
+identifier end space
+identifier data
+end_command .
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+identifier begin space
+identifier data space
+quoted_string "xxx"
+end_command .
+newline \n (first)
+
+identifier begin space
+identifier data space
+number 123
+end_command .
+newline \n (first)
+
+reserved_word not space
+identifier data
+newline \n (later)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([DO REPEAT command])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+do repeat x=a b c
+ y=d e f.
+ do repeat a=1 thru 5.
+another command.
+second command
++ third command.
+end /* x */ /* y */ repeat print.
+end
+ repeat.
+do
+ repeat #a=1.
+ inner command.
+end repeat.
+])
+AT_DATA([expout], [dnl
+identifier do space
+identifier repeat space
+identifier x
+punct =
+identifier a space
+identifier b space
+identifier c
+newline \n (later)
+
+spaces __________
+identifier y
+punct =
+identifier d space
+identifier e space
+identifier f
+end_command .
+newline \n (DO REPEAT)
+
+do_repeat_command __do_repeat_a=1_thru_5.
+newline \n (DO REPEAT)
+
+do_repeat_command another_command.
+newline \n (DO REPEAT)
+
+do_repeat_command second_command
+newline \n (DO REPEAT)
+
+do_repeat_command +_third_command.
+newline \n (DO REPEAT)
+
+do_repeat_command end_/*_x_*/_/*_y_*/_repeat_print.
+newline \n (DO REPEAT)
+
+identifier end
+newline \n (later)
+ space
+identifier repeat
+end_command .
+newline \n (first)
+
+identifier do
+newline \n (later)
+
+spaces __
+identifier repeat space
+identifier #a
+punct =
+number 1
+end_command .
+newline \n (DO REPEAT)
+
+do_repeat_command __inner_command.
+newline \n (DO REPEAT)
+
+identifier end space
+identifier repeat
+end_command .
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-i])
+AT_CLEANUP
+
+AT_SETUP([batch mode])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+first command
+ another line of first command
++ second command
+third command
+
+fourth command.
+ fifth command.
+])
+AT_DATA([expout], [dnl
+identifier first space
+identifier command
+newline \n (later)
+
+spaces _____
+identifier another space
+identifier line space
+identifier of space
+identifier first space
+identifier command
+newline \n (later)
+
+start_command +
+spaces __
+identifier second space
+identifier command
+newline \n (later)
+
+start_command
+identifier third space
+identifier command
+newline \n (later)
+
+separate_commands
+newline \n (first)
+
+identifier fourth space
+identifier command
+end_command .
+newline \n (first)
+
+spaces ___
+identifier fifth space
+identifier command
+end_command .
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-b])
+AT_CLEANUP
+
+AT_SETUP([auto mode])
+AT_KEYWORDS([segment])
+AT_DATA([input], [dnl
+command
+ another line of command
+2sls
++ another command
+another line of second command
+data list /x 1
+aggregate.
+print eject.
+twostep cluster
+
+
+fourth command.
+ fifth command.
+])
+AT_DATA([expout], [dnl
+identifier command
+newline \n (later)
+
+spaces _____
+identifier another space
+identifier line space
+identifier of space
+identifier command
+newline \n (later)
+
+start_command
+number 2
+identifier sls
+newline \n (later)
+
+start_command +
+spaces __
+identifier another space
+identifier command
+newline \n (later)
+
+identifier another space
+identifier line space
+identifier of space
+identifier second space
+identifier command
+newline \n (later)
+
+start_command
+identifier data space
+identifier list space
+punct /
+identifier x space
+number 1
+newline \n (later)
+
+start_command
+identifier aggregate
+end_command .
+newline \n (first)
+
+identifier print space
+identifier eject
+end_command .
+newline \n (first)
+
+identifier twostep space
+identifier cluster
+newline \n (later)
+
+separate_commands
+newline \n (first)
+
+separate_commands
+newline \n (first)
+
+identifier fourth space
+identifier command
+end_command .
+newline \n (first)
+
+spaces ___
+identifier fifth space
+identifier command
+end_command .
+newline \n (first)
+
+end <U+0000>
+])
+PSPP_CHECK_SEGMENT([-a])
+AT_CLEANUP
--
1.7.2.3
- [PATCH 04/18] output: New function text_item_create_nocopy()., (continued)
- [PATCH 04/18] output: New function text_item_create_nocopy()., Ben Pfaff, 2011/03/19
- [PATCH 05/18] str: New function ss_realloc()., Ben Pfaff, 2011/03/19
- [PATCH 13/18] i18n: New functions and data structure for obtaining encoding info., Ben Pfaff, 2011/03/19
- [PATCH 06/18] str: Rename ss_chomp() to ss_chomp_byte(), ds_chomp() to ds_chomp_byte()., Ben Pfaff, 2011/03/19
- [PATCH 02/18] file-name: Do not make output files line-buffered in fn_open()., Ben Pfaff, 2011/03/19
- [PATCH 12/18] identifier: Rename token_type_to_string() and make a new version., Ben Pfaff, 2011/03/19
- [PATCH 08/18] hash-functions: New function hash_case_bytes()., Ben Pfaff, 2011/03/19
- [PATCH 11/18] i18n: New functions for truncating strings in an arbitrary encoding., Ben Pfaff, 2011/03/19
- [PATCH 17/18] scan: New library for high-level PSPP syntax lexical analysis., Ben Pfaff, 2011/03/19
- [PATCH 15/18] u8-istream: New library for reading a text file and recoding to UTF-8., Ben Pfaff, 2011/03/19
- [PATCH 16/18] segment: New library for low-level phase of lexical syntax analysis.,
Ben Pfaff <=
- [PATCH 03/18] sys-file-reader: Refactor to clean up character encoding support., Ben Pfaff, 2011/03/19
- Re: [PATCH 00/18] rewrite PSPP lexer, John Darrington, 2011/03/20
- Re: [PATCH 00/18] rewrite PSPP lexer, John Darrington, 2011/03/22