branch master updated: Protect vertical tab in XML

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: Protect vertical tab in XML

From:	Patrice Dumas
Subject:	branch master updated: Protect vertical tab in XML
Date:	Mon, 28 Nov 2022 19:00:21 -0500
This is an automated email from the git hooks/post-receive script.

pertusus pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new d171e325c6 Protect vertical tab in XML
d171e325c6 is described below

commit d171e325c63f2c36e4fe4ab1a53468a338478826
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Tue Nov 29 01:00:10 2022 +0100

    Protect vertical tab in XML
    
    * tp/Texinfo/Convert/TexinfoMarkup.pm
    (_protect_in_spaces_attribute_text), tp/Texinfo/Convert/TexinfoXML.pm
    (_xml_attributes, _protect_text), util/txixml2texi.pl,
    util/texinfo.dtd: protect vertical tab U+000B as \v in attributes and
    with verticaltab added entities in other contexts.
---
 ChangeLog                                          | 10 +++++
 tp/Texinfo/Convert/TexinfoMarkup.pm                |  9 ++++-
 tp/Texinfo/Convert/TexinfoXML.pm                   | 39 +++++++++++++++----
 tp/t/30sectioning.t                                |  1 +
 .../sectioning/in_menu_only_special_spaces_node.pl | 44 ++++++++++++++++++++++
 .../results/sectioning/only_special_spaces_node.pl | 34 +++++++++++++++++
 util/texinfo.dtd                                   |  3 ++
 util/txixml2texi.pl                                |  8 +++-
 8 files changed, 138 insertions(+), 10 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 0dc475bb6a..f0528f60f9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2022-11-28  Patrice Dumas  <pertusus@free.fr>
+
+       Protect vertical tab in XML
+
+       * tp/Texinfo/Convert/TexinfoMarkup.pm
+       (_protect_in_spaces_attribute_text), tp/Texinfo/Convert/TexinfoXML.pm
+       (_xml_attributes, _protect_text), util/txixml2texi.pl,
+       util/texinfo.dtd: protect vertical tab U+000B as \v in attributes and
+       with verticaltab added entities in other contexts.
+
 2022-11-28  Patrice Dumas  <pertusus@free.fr>
 
        XML menu leading text and menu separator in elements instead of attributes
diff --git a/tp/Texinfo/Convert/TexinfoMarkup.pm b/tp/Texinfo/Convert/TexinfoMarkup.pm
index d78028e6c3..75ffbd6480 100644
--- a/tp/Texinfo/Convert/TexinfoMarkup.pm
+++ b/tp/Texinfo/Convert/TexinfoMarkup.pm
@@ -446,12 +446,19 @@ sub convert_tree($$)
   return $self->_convert($root);
 }
 
-# FIXME is that function markup format specific or not?
+# FIXME that function is markup format specific, it only works if \ is not
+# special in the markup language
 sub _protect_in_spaces_attribute_text($)
 {
   my $text = shift;
   $text =~ s/\n/\\n/g;
+  # protect formfeed in space attributes.  It is necessary for XML 1.0
+  # (and most likely XML 1.1) and probably a good thing in other formats.
   $text =~ s/\f/\\f/g;
+  # In a Perl regex \v matches every vertical space, not only U+000B vertical
+  # tab, so U+000B is matched explicitly.  We nevertheless use \v in the output
+  # to represent ^K as is customarily done in other contexts.
+  $text =~ s/\N{U+000B}/\\v/g;
   return $text;
 }
 
diff --git a/tp/Texinfo/Convert/TexinfoXML.pm b/tp/Texinfo/Convert/TexinfoXML.pm
index 1b9d02df6e..fc423647e6 100644
--- a/tp/Texinfo/Convert/TexinfoXML.pm
+++ b/tp/Texinfo/Convert/TexinfoXML.pm
@@ -71,6 +71,27 @@ my %special_xml_attributes = (
   'verbatim' => {'space' => 'xml:space'},
 );
 
+# Both attributes and CData are defined in terms of Char:
+# Char is defined as: https://www.w3.org/TR/REC-xml/#charsets
+# Char    ::=          #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+# A corresponding character class regexp could be
+# [^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]
+# or, for the forbidden control characters below U+0020 only,
+# [\x00-\x08\x0B\x0C\x0E-\x1F]
+
+# Attributes are defined as https://www.w3.org/TR/REC-xml/#NT-AttValue
+#      AttValue           ::=          '"' ([^<&"] | Reference)* '"'
+#                      |  "'" ([^<&'] | Reference)* "'"
+# Reference as https://www.w3.org/TR/REC-xml/#NT-Reference
+# Reference       ::=          EntityRef | CharRef
+# Next CharRef is defined as https://www.w3.org/TR/REC-xml/#NT-CharRef
+# CharRef         ::=          '&#' [0-9]+ ';'
+#                      | '&#x' [0-9a-fA-F]+ ';'
+# With the additional constraint that
+# Characters referred to using character references MUST match the production for Char.
+# Which means that numerical entities used in attributes should correspond to
+# characters in the range of acceptable characters.  For example form feed is not
+# in that range, such that both \f and &#12; are invalid.
 sub _xml_attributes($$)
 {
   my $self = shift;
@@ -84,18 +105,18 @@ sub _xml_attributes($$)
     if (ref($attribute_spec) ne 'ARRAY') {
        cluck "attribute_spec not an array($attribute_spec).";
     }
-    # this cannot be used because of formfeed, as '<', which
-    # is substituted from &formfeed; is not allowed in attribute.
+    # _protect_text cannot be used because of formfeed and verticaltab,
+    # as they become elements, which are not allowed in attribute.
     #my $text = $self->_protect_text($attribute_spec->[1]);
     my $text = $self->xml_protect_text($attribute_spec->[1]);
-    # in fact form feed is not allowed at all in XML, even protected
-    # and even in xml 1.1 in contrast to what is said on internet.
-    # maybe this is a limitation of libxml?
-    #$text =~ s/\f/&#12;/g;
+    # Form feed/vertical tab U+000B are not allowed at all in XML attributes,
+    # even protected (and even in xml 1.1 in contrast to what is said on
+    # internet).  Cf above the full explanation for XML 1.0.
     if ($attribute_spec->[0] ne 'spaces'
         and $attribute_spec->[0] ne 'trailingspaces') {
       $text =~ s/\f/&attrformfeed;/g;
-      # &attrformfeed; resolves to \f so \ are doubled
+      $text =~ s/\N{U+000B}/&attrverticaltab;/g;
+      # &attrformfeed; and similar entities resolve to \f and similar, so \ are doubled
       $text =~ s/\\/\\\\/g;
     }
     my $attribute_name = $attribute_spec->[0];
@@ -164,12 +185,16 @@ sub txi_markup_comment($$)
 }
 
 # form feed is not accepted in xml, replace it.
+# The CData symbol is defined in terms of Char: https://www.w3.org/TR/REC-xml/#sec-cdata-sect
+# CData           ::=          (Char* - (Char* ']]>' Char*))
 sub _protect_text($$)
 {
   my $self = shift;
   my $text = shift;
   my $result = $self->xml_protect_text($text);
   $result =~ s/\f/&formfeed;/g;
+  # \v would match every vertical space, not only vertical tab U+000B
+  $result =~ s/\N{U+000B}/&verticaltab;/g;
   return $result;
 }
 
diff --git a/tp/t/30sectioning.t b/tp/t/30sectioning.t
index 2e12768b89..070a36af7c 100644
--- a/tp/t/30sectioning.t
+++ b/tp/t/30sectioning.t
@@ -2144,6 +2144,7 @@ my @xml_tests_info_tests = ('part_chapter_after_top',
   'node_part_chapter_after_chapter', 'section_before_top',
   'section_node_before_part', 'top_node_part_top',
   'chapter_node_before_and_after_part',
+  'in_menu_only_special_spaces_node', 'only_special_spaces_node',
   'more_nodes_than_sections', 'part_node_chapter_appendix',
   'part_node_part_appendix', 'part_node_chapter_node_appendix',
   'part_node_part_node_appendix', 'part_node_node_part_appendix',
diff --git a/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl b/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
index c764b034bf..aa9c8bac98 100644
--- a/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
+++ b/tp/t/results/sectioning/in_menu_only_special_spaces_node.pl
@@ -1413,4 +1413,48 @@ Previous: <a href="#g_t_180e_2003" accesskey="p" 
rel="prev">MONGOLIAN VOWEL SEPA
 </html>
 ';
 
+
+$result_converted{'xml'}->{'in_menu_only_special_spaces_node'} = 
'<preamblebeforebeginning>
+</preamblebeforebeginning><node name="Top" spaces=" 
"><nodename>Top</nodename><nodenext automatic="on">   </nodenext></node>
+<top spaces=" "><sectiontitle>top</sectiontitle>
+
+<menu endspaces=" ">
+<menuentry><menuleadingtext>* </menuleadingtext><menunode>   
</menunode><menuseparator>::</menuseparator><menudescription><pre 
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>*      
&formfeed;&verticaltab;</menuleadingtext><menunode></menunode><menuseparator>::</menuseparator><menudescription><pre
 xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>* 
'."\r".'</menuleadingtext><menunode></menunode><menuseparator>::</menuseparator><menudescription><pre
 xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>* 
</menuleadingtext><menunode>  
</menunode><menuseparator>::</menuseparator><menudescription><pre 
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>* 
</menuleadingtext><menunode>᠎ 
</menunode><menuseparator>::</menuseparator><menudescription><pre 
xml:space="preserve">
+</pre></menudescription></menuentry><menuentry><menuleadingtext>* 
</menuleadingtext><menunode>           　
</menunode><menuseparator>::</menuseparator><menudescription><pre 
xml:space="preserve">
+</pre></menudescription></menuentry></menu>
+
+
+</top>
+<node name="_2002_2003_2002" spaces=" "><nodename>   </nodename><nodenext 
automatic="on"></nodenext><nodeprev automatic="on">Top</nodeprev><nodeup 
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>EN QUAD| | EM QUAD| | EN SPACE| 
|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces="       \\f\\v"></nodename></node>
+<chapter spaces=" "><sectiontitle>CHARACTER TABULATION|        | FORM 
FEED|&formfeed;| LINE TABULATION|&verticaltab;|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces=" '."\r".'"></nodename></node>
+<chapter spaces=" "><sectiontitle>CARRIAGE RETURN|'."\r".'|</sectiontitle>
+
+</chapter>
+<node name="_0085_00a0_1680" spaces=" "><nodename>  </nodename><nodenext 
automatic="on">᠎ </nodenext><nodeprev automatic="on"></nodeprev><nodeup 
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>NEXT LINE (NEL)|| NO-BREAK SPACE| | OGHAM 
SPACE MARK| |</sectiontitle>
+
+</chapter>
+<node name="_180e_2003" spaces=" "><nodename>᠎ </nodename><nodenext 
automatic="on">           　</nodenext><nodeprev automatic="on">  
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>MONGOLIAN VOWEL SEPARATOR|᠎| EM SPACE| 
|</sectiontitle>
+
+</chapter>
+<node name="_2004_2005_2006_2007_2008_2009_200a_2028_2029_202f_205f_3000" 
spaces=" "><nodename>           　</nodename><nodeprev automatic="on">᠎ 
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+
+<chapter spaces=" "><sectiontitle>THREE-PER-EM SPACE| | FOUR-PER-EM SPACE| | 
SIX-PER-EM SPACE| | FIGURE SPACE| | PUNCTUATION SPACE| | THIN SPACE| | HAIR 
SPACE| | LINE SEPARATOR| | PARAGRAPH SEPARATOR| | NARROW NO-BREAK SPACE| | 
MEDIUM MATHEMATICAL SPACE| | IDEOGRAPHIC SPACE|　|</sectiontitle>
+
+</chapter>
+<bye></bye>
+';
+
 1;
diff --git a/tp/t/results/sectioning/only_special_spaces_node.pl b/tp/t/results/sectioning/only_special_spaces_node.pl
index c1b9041ee2..e374085005 100644
--- a/tp/t/results/sectioning/only_special_spaces_node.pl
+++ b/tp/t/results/sectioning/only_special_spaces_node.pl
@@ -974,4 +974,38 @@ Previous: <a href="#g_t_180e_2003" accesskey="p" 
rel="prev">MONGOLIAN VOWEL SEPA
 </html>
 ';
 
+
+$result_converted{'xml'}->{'only_special_spaces_node'} = 
'<preamblebeforebeginning>
+</preamblebeforebeginning><node name="Top" spaces=" 
"><nodename>Top</nodename><nodenext automatic="on">   </nodenext></node>
+<top spaces=" "><sectiontitle>top</sectiontitle>
+
+</top>
+<node name="_2002_2003_2002" spaces=" "><nodename>   </nodename><nodenext 
automatic="on"></nodenext><nodeprev automatic="on">Top</nodeprev><nodeup 
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>EN QUAD| | EM QUAD| | EN SPACE| 
|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces="       \\f\\v"></nodename></node>
+<chapter spaces=" "><sectiontitle>CHARACTER TABULATION|        | FORM 
FEED|&formfeed;| LINE TABULATION|&verticaltab;|</sectiontitle>
+
+</chapter>
+<node name=""><nodename trailingspaces=" '."\r".'"></nodename></node>
+<chapter spaces=" "><sectiontitle>CARRIAGE RETURN|'."\r".'|</sectiontitle>
+
+</chapter>
+<node name="_0085_00a0_1680" spaces=" "><nodename>  </nodename><nodenext 
automatic="on">᠎ </nodenext><nodeprev automatic="on"></nodeprev><nodeup 
automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>NEXT LINE (NEL)|| NO-BREAK SPACE| | OGHAM 
SPACE MARK| |</sectiontitle>
+
+</chapter>
+<node name="_180e_2003" spaces=" "><nodename>᠎ </nodename><nodenext 
automatic="on">           　</nodenext><nodeprev automatic="on">  
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+<chapter spaces=" "><sectiontitle>MONGOLIAN VOWEL SEPARATOR|᠎| EM SPACE| 
|</sectiontitle>
+
+</chapter>
+<node name="_2004_2005_2006_2007_2008_2009_200a_2028_2029_202f_205f_3000" 
spaces=" "><nodename>           　</nodename><nodeprev automatic="on">᠎ 
</nodeprev><nodeup automatic="on">Top</nodeup></node>
+
+<chapter spaces=" "><sectiontitle>THREE-PER-EM SPACE| | FOUR-PER-EM SPACE| | 
SIX-PER-EM SPACE| | FIGURE SPACE| | PUNCTUATION SPACE| | THIN SPACE| | HAIR 
SPACE| | LINE SEPARATOR| | PARAGRAPH SEPARATOR| | NARROW NO-BREAK SPACE| | 
MEDIUM MATHEMATICAL SPACE| | IDEOGRAPHIC SPACE|　|</sectiontitle>
+
+</chapter>
+<bye></bye>
+';
+
 1;
diff --git a/util/texinfo.dtd b/util/texinfo.dtd
index 060e89dde7..47598710a9 100644
--- a/util/texinfo.dtd
+++ b/util/texinfo.dtd
@@ -1331,6 +1331,7 @@
 <!ELEMENT linebreak EMPTY>
 <!ELEMENT noeos EMPTY>
 <!ELEMENT formfeed EMPTY>
+<!ELEMENT verticaltab EMPTY>
 <!ELEMENT divideheading EMPTY>
 
 <!ENTITY tex        "<logo>TeX</logo>">
@@ -1380,6 +1381,8 @@ Use exactly what is on the XML specification
 <!ENTITY textndash      "&#x2013;">
 <!ENTITY formfeed       "<formfeed/>">
 <!ENTITY attrformfeed   "\f">
+<!ENTITY verticaltab       "<verticaltab/>">
+<!ENTITY attrverticaltab   "\v">
 <!ENTITY period     "<punct end-of-sentence='no'>.</punct>">
 <!ENTITY eosperiod  "<punct end-of-sentence='yes'>.</punct>">
 <!ENTITY quest      "<punct end-of-sentence='no'>?</punct>">
diff --git a/util/txixml2texi.pl b/util/txixml2texi.pl
index 2ae469b4a3..9510bd19b7 100755
--- a/util/txixml2texi.pl
+++ b/util/txixml2texi.pl
@@ -140,9 +140,11 @@ my %entity_texts = (
   'textrsquo' => "'",
   'textlsquo' => '`',
   'formfeed' => "\f",
-  # this is not used in pratice, as attrformfeed appears in an
-  # attribute and thus is already expanded to text.
+  'verticaltab' => "\x{000B}",
+  # following mappings are not used in practice, as attrformfeed and similar
+  # appear in attributes and thus are already expanded to text.
   'attrformfeed' => "\f",
+  'attrverticaltab' => "\x{000B}",
 );
 
 # contains nobrace symbol and brace noarg commands
@@ -288,7 +290,9 @@ while ($reader->read) {
     my $spaces = $reader->getAttribute('spaces');
     if (defined($spaces)) {
       $spaces =~ s/\\n/\n/g;
+      # convert back formfeed and other special characters
       $spaces =~ s/\\f/\f/g;
+      $spaces =~ s/\\v/\x{000B}/g;
     } else {
       $spaces = '';
     }
[Prev in Thread]
Current Thread
[Next in Thread]
branch master updated: Protect vertical tab in XML, Patrice Dumas <=
Prev by Date: branch master updated: tp/maintain/check_back_xml_forth_texi_t.sh: add arguments to optionally select one directory and one test.
Next by Date: branch master updated: * util/txixml2texi.pl: avoid double comment for line specific command.
Previous by thread: branch master updated: tp/maintain/check_back_xml_forth_texi_t.sh: add arguments to optionally select one directory and one test.
Next by thread: branch master updated: * util/txixml2texi.pl: avoid double comment for line specific command.
Index(es):
- Date
- Thread