[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: Protect vertical tab in XML
From: |
Patrice Dumas |
Subject: |
branch master updated: Protect vertical tab in XML |
Date: |
Mon, 28 Nov 2022 19:00:21 -0500 |
This is an automated email from the git hooks/post-receive script.
pertusus pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new d171e325c6 Protect vertical tab in XML
d171e325c6 is described below
commit d171e325c63f2c36e4fe4ab1a53468a338478826
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Tue Nov 29 01:00:10 2022 +0100
Protect vertical tab in XML
* tp/Texinfo/Convert/TexinfoMarkup.pm
(_protect_in_spaces_attribute_text), tp/Texinfo/Convert/TexinfoXML.pm
(_xml_attributes, _protect_text), util/txixml2texi.pl,
util/texinfo.dtd: protect vertical tab U+000B as \v in attributes and
with verticaltab added entities in other contexts.
---
ChangeLog | 10 +++++
tp/Texinfo/Convert/TexinfoMarkup.pm | 9 ++++-
tp/Texinfo/Convert/TexinfoXML.pm | 39 +++++++++++++++----
tp/t/30sectioning.t | 1 +
.../sectioning/in_menu_only_special_spaces_node.pl | 44 ++++++++++++++++++++++
.../results/sectioning/only_special_spaces_node.pl | 34 +++++++++++++++++
util/texinfo.dtd | 3 ++
util/txixml2texi.pl | 8 +++-
8 files changed, 138 insertions(+), 10 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 0dc475bb6a..f0528f60f9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2022-11-28 Patrice Dumas <pertusus@free.fr>
+
+ Protect vertical tab in XML
+
+ * tp/Texinfo/Convert/TexinfoMarkup.pm
+ (_protect_in_spaces_attribute_text), tp/Texinfo/Convert/TexinfoXML.pm
+ (_xml_attributes, _protect_text), util/txixml2texi.pl,
+ util/texinfo.dtd: protect vertical tab U+000B as \v in attributes and
+ with verticaltab added entities in other contexts.
+
2022-11-28 Patrice Dumas <pertusus@free.fr>
XML menu leading text and menu separator in elements instead of
attributes
diff --git a/tp/Texinfo/Convert/TexinfoMarkup.pm
b/tp/Texinfo/Convert/TexinfoMarkup.pm
index d78028e6c3..75ffbd6480 100644
--- a/tp/Texinfo/Convert/TexinfoMarkup.pm
+++ b/tp/Texinfo/Convert/TexinfoMarkup.pm
@@ -446,12 +446,19 @@ sub convert_tree($$)
return $self->_convert($root);
}
-# FIXME is that function markup format specific or not?
+# FIXME that function is markup format specific, it only works if \ is not
+# special in the markup language
sub _protect_in_spaces_attribute_text($)
{
my $text = shift;
$text =~ s/\n/\\n/g;
+ # protect formfeed in space attributes. It is necessary for XML 1.0
+ # (and most likely XML 1.1) and probably a good thing in other formats.
$text =~ s/\f/\\f/g;
+ # In a Perl regexp \v matches U+000B vertical tab but also other vertical spaces, so U+000B is matched explicitly.
+ # We nevertheless use \v here to represent ^K as is customarily done in other
+ # contexts.
+ $text =~ s/\N{U+000B}/\\v/g;
return $text;
}
diff --git a/tp/Texinfo/Convert/TexinfoXML.pm b/tp/Texinfo/Convert/TexinfoXML.pm
index 1b9d02df6e..fc423647e6 100644
--- a/tp/Texinfo/Convert/TexinfoXML.pm
+++ b/tp/Texinfo/Convert/TexinfoXML.pm
@@ -71,6 +71,27 @@ my %special_xml_attributes = (
'verbatim' => {'space' => 'xml:space'},
);
+# Both attributes and CData are defined in term of chars:
+# Char is defined as: https://www.w3.org/TR/REC-xml/#charsets
+# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+# A corresponding character class regexp could be
+# [^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]
+# or
+# [\x00-\x08 \x0B \x0C \x0E-\x1F]
+
+# Attributes are defined as https://www.w3.org/TR/REC-xml/#NT-AttValue
+# AttValue ::= '"' ([^<&"] | Reference)* '"'
+# | "'" ([^<&'] | Reference)* "'"
+# Reference as https://www.w3.org/TR/REC-xml/#NT-Reference
+# Reference ::= EntityRef | CharRef
+# Next CharRef is defined as https://www.w3.org/TR/REC-xml/#NT-CharRef
+# CharRef ::= '&#' [0-9]+ ';'
+# | '&#x' [0-9a-fA-F]+ ';'
+# With the additional constraint that
+# Characters referred to using character references MUST match the production for Char.
+# Which means that numerical entities used in attributes should correspond to
+# characters in the range of acceptable characters. For example form feed is not
+# in that range, such that both \f and &#12; are invalid.
sub _xml_attributes($$)
{
my $self = shift;
@@ -84,18 +105,18 @@ sub _xml_attributes($$)
if (ref($attribute_spec) ne 'ARRAY') {
cluck "attribute_spec not an array($attribute_spec).";
}
- # this cannot be used because of formfeed, as '<', which
- # is substituted from &formfeed; is not allowed in attribute.
+ # _protect_text cannot be used because of formfeed and verticaltab,
+ # as they become elements, which are not allowed in attribute.
#my $text = $self->_protect_text($attribute_spec->[1]);
my $text = $self->xml_protect_text($attribute_spec->[1]);
- # in fact form feed is not allowed at all in XML, even protected
- # and even in xml 1.1 in contrast to what is said on internet.
- # maybe this is a limitation of libxml?
- #$text =~ s/\f//g;
+ # Form feed/vertical tab U+000B are not allowed at all in XML attributes,
+ # even protected (and even in xml 1.1 in contrast to what is said on
+ # internet). Cf above the full explanation for XML 1.0.
if ($attribute_spec->[0] ne 'spaces'
and $attribute_spec->[0] ne 'trailingspaces') {
$text =~ s/\f/&attrformfeed;/g;
- # &attrformfeed; resolves to \f so \ are doubled
+ $text =~ s/\N{U+000B}/&attrverticaltab;/g;
+ # &attrformfeed; and similar entities resolve to \f and similar, so \ are doubled
$text =~ s/\\/\\\\/g;
}
my $attribute_name = $attribute_spec->[0];
@@ -164,12 +185,16 @@ sub txi_markup_comment($$)
}
# form feed is not accepted in xml, replace it.
+# The CData symbol is defined in terms of Char: https://www.w3.org/TR/REC-xml/#sec-cdata-sect
+# CData ::= (Char* - (Char* ']]>' Char*))
sub _protect_text($$)
{
my $self = shift;
my $text = shift;
my $result = $self->xml_protect_text($text);
$result =~ s/\f/&formfeed;/g;
+ # \v matches many vertical spaces and not only vertical tab U+000B, so U+000B is matched explicitly
+ $result =~ s/\N{U+000B}/&verticaltab;/g;
return $result;
}
diff --git a/tp/t/30sectioning.t b/tp/t/30sectioning.t
index 2e12768b89..070a36af7c 100644
--- a/tp/t/30sectioning.t
+++ b/tp/t/30sectioning.t
@@ -2144,6 +2144,7 @@ my @xml_tests_info_tests = ('part_chapter_after_top',
'node_part_chapter_after_chapter', 'section_before_top',
'section_node_before_part', 'top_node_part_top',
'chapter_node_before_and_after_part',
+ 'in_menu_only_special_spaces_node', 'only_special_spaces_node',
'more_nodes_than_sections', 'part_node_chapter_appendix',
'part_node_part_appendix', 'part_node_chapter_node_appendix',
'part_node_part_node_appendix', 'part_node_node_part_appendix',
diff --git a/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
b/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
index c764b034bf..aa9c8bac98 100644
--- a/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
+++ b/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
@@ -1413,4 +1413,48 @@ Previous: <a href="#g_t_180e_2003" accesskey="p"
rel="prev">MONGOLIAN VOWEL SEPA
</html>
';
+
+$result_converted{'xml'}->{'in_menu_only_special_spaces_node'} =
'<preamblebeforebeginning>
+</preamblebeforebeginning><node name="Top" spaces="
"><nodename>Top</nodename><nodenext automatic="on"> </nodenext></node>
+<top spaces=" "><sectiontitle>top</sectiontitle>
+
+<menu endspaces=" ">
+<menuentry><menuleadingtext>* </menuleadingtext><menunode>
</menunode><menuseparator>::</menuseparator><menudescription><pre
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>*
&formfeed;&verticaltab;</menuleadingtext><menunode></menunode><menuseparator>::</menuseparator><menudescription><pre
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>*
'."\r".'</menuleadingtext><menunode></menunode><menuseparator>::</menuseparator><menudescription><pre
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>*
</menuleadingtext><menunode>
</menunode><menuseparator>::</menuseparator><menudescription><pre
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>*
</menuleadingtext><menunode>
</menunode><menuseparator>::</menuseparator><menudescription><pre
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>*
</menuleadingtext><menunode>
</menunode><menuseparator>::</menuseparator><menudescription><pre
xml:space="preserve">
+</pre></menudescription></menuentry></menu>
+
+
+</top>
+<node name="_2002_2003_2002" spaces=" "><nodename> </nodename><nodenext
automatic="on"></nodenext><nodeprev automatic="on">Top</nodeprev><nodeup
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>EN QUAD| | EM QUAD| | EN SPACE|
|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces=" \\f\\v"></nodename></node>
+<chapter spaces=" "><sectiontitle>CHARACTER TABULATION| | FORM
FEED|&formfeed;| LINE TABULATION|&verticaltab;|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces=" '."\r".'"></nodename></node>
+<chapter spaces=" "><sectiontitle>CARRIAGE RETURN|'."\r".'|</sectiontitle>
+
+</chapter>
+<node name="_0085_00a0_1680" spaces=" "><nodename>
</nodename><nodenext
automatic="on"> </nodenext><nodeprev automatic="on"></nodeprev><nodeup
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>NEXT LINE (NEL)|
| NO-BREAK SPACE| | OGHAM
SPACE MARK| |</sectiontitle>
+
+</chapter>
+<node name="_180e_2003" spaces=" "><nodename> </nodename><nodenext
automatic="on">
</nodenext><nodeprev automatic="on">
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>MONGOLIAN VOWEL SEPARATOR|| EM SPACE|
|</sectiontitle>
+
+</chapter>
+<node name="_2004_2005_2006_2007_2008_2009_200a_2028_2029_202f_205f_3000"
spaces=" "><nodename>
</nodename><nodeprev automatic="on">
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+
+<chapter spaces=" "><sectiontitle>THREE-PER-EM SPACE| | FOUR-PER-EM SPACE| |
SIX-PER-EM SPACE| | FIGURE SPACE| | PUNCTUATION SPACE| | THIN SPACE| | HAIR
SPACE| | LINE SEPARATOR|
| PARAGRAPH SEPARATOR|
| NARROW NO-BREAK SPACE| |
MEDIUM MATHEMATICAL SPACE| | IDEOGRAPHIC SPACE| |</sectiontitle>
+
+</chapter>
+<bye></bye>
+';
+
1;
diff --git a/tp/t/results/sectioning/only_special_spaces_node.pl
b/tp/t/results/sectioning/only_special_spaces_node.pl
index c1b9041ee2..e374085005 100644
--- a/tp/t/results/sectioning/only_special_spaces_node.pl
+++ b/tp/t/results/sectioning/only_special_spaces_node.pl
@@ -974,4 +974,38 @@ Previous: <a href="#g_t_180e_2003" accesskey="p"
rel="prev">MONGOLIAN VOWEL SEPA
</html>
';
+
+$result_converted{'xml'}->{'only_special_spaces_node'} =
'<preamblebeforebeginning>
+</preamblebeforebeginning><node name="Top" spaces="
"><nodename>Top</nodename><nodenext automatic="on"> </nodenext></node>
+<top spaces=" "><sectiontitle>top</sectiontitle>
+
+</top>
+<node name="_2002_2003_2002" spaces=" "><nodename> </nodename><nodenext
automatic="on"></nodenext><nodeprev automatic="on">Top</nodeprev><nodeup
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>EN QUAD| | EM QUAD| | EN SPACE|
|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces=" \\f\\v"></nodename></node>
+<chapter spaces=" "><sectiontitle>CHARACTER TABULATION| | FORM
FEED|&formfeed;| LINE TABULATION|&verticaltab;|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces=" '."\r".'"></nodename></node>
+<chapter spaces=" "><sectiontitle>CARRIAGE RETURN|'."\r".'|</sectiontitle>
+
+</chapter>
+<node name="_0085_00a0_1680" spaces=" "><nodename>
</nodename><nodenext
automatic="on"> </nodenext><nodeprev automatic="on"></nodeprev><nodeup
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>NEXT LINE (NEL)|
| NO-BREAK SPACE| | OGHAM
SPACE MARK| |</sectiontitle>
+
+</chapter>
+<node name="_180e_2003" spaces=" "><nodename> </nodename><nodenext
automatic="on">
</nodenext><nodeprev automatic="on">
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>MONGOLIAN VOWEL SEPARATOR|| EM SPACE|
|</sectiontitle>
+
+</chapter>
+<node name="_2004_2005_2006_2007_2008_2009_200a_2028_2029_202f_205f_3000"
spaces=" "><nodename>
</nodename><nodeprev automatic="on">
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+
+<chapter spaces=" "><sectiontitle>THREE-PER-EM SPACE| | FOUR-PER-EM SPACE| |
SIX-PER-EM SPACE| | FIGURE SPACE| | PUNCTUATION SPACE| | THIN SPACE| | HAIR
SPACE| | LINE SEPARATOR|
| PARAGRAPH SEPARATOR|
| NARROW NO-BREAK SPACE| |
MEDIUM MATHEMATICAL SPACE| | IDEOGRAPHIC SPACE| |</sectiontitle>
+
+</chapter>
+<bye></bye>
+';
+
1;
diff --git a/util/texinfo.dtd b/util/texinfo.dtd
index 060e89dde7..47598710a9 100644
--- a/util/texinfo.dtd
+++ b/util/texinfo.dtd
@@ -1331,6 +1331,7 @@
<!ELEMENT linebreak EMPTY>
<!ELEMENT noeos EMPTY>
<!ELEMENT formfeed EMPTY>
+<!ELEMENT verticaltab EMPTY>
<!ELEMENT divideheading EMPTY>
<!ENTITY tex "<logo>TeX</logo>">
@@ -1380,6 +1381,8 @@ Use exactly what is on the XML specification
<!ENTITY textndash "–">
<!ENTITY formfeed "<formfeed/>">
<!ENTITY attrformfeed "\f">
+<!ENTITY verticaltab "<verticaltab/>">
+<!ENTITY attrverticaltab "\v">
<!ENTITY period "<punct end-of-sentence='no'>.</punct>">
<!ENTITY eosperiod "<punct end-of-sentence='yes'>.</punct>">
<!ENTITY quest "<punct end-of-sentence='no'>?</punct>">
diff --git a/util/txixml2texi.pl b/util/txixml2texi.pl
index 2ae469b4a3..9510bd19b7 100755
--- a/util/txixml2texi.pl
+++ b/util/txixml2texi.pl
@@ -140,9 +140,11 @@ my %entity_texts = (
'textrsquo' => "'",
'textlsquo' => '`',
'formfeed' => "\f",
- # this is not used in pratice, as attrformfeed appears in an
- # attribute and thus is already expanded to text.
+ 'verticaltab' => "\x{000B}",
+ # following mappings are not used in practice, as attrformfeed and similar
+ # appear in attributes and thus are already expanded to text.
'attrformfeed' => "\f",
+ 'attrverticaltab' => "\x{000B}",
);
# contains nobrace symbol and brace noarg commands
@@ -288,7 +290,9 @@ while ($reader->read) {
my $spaces = $reader->getAttribute('spaces');
if (defined($spaces)) {
$spaces =~ s/\\n/\n/g;
+ # convert back formfeed and other special characters
$spaces =~ s/\\f/\f/g;
+ $spaces =~ s/\\v/\x{000B}/g;
} else {
$spaces = '';
}
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: Protect vertical tab in XML,
Patrice Dumas <=