koha-cvs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Koha-cvs] CVS: koha/misc/translator TmplToken.pm,1.3,1.4 TmplTokenizer.


From: Ambrose C. LI
Subject: [Koha-cvs] CVS: koha/misc/translator TmplToken.pm,1.3,1.4 TmplTokenizer.pm,1.21,1.22 tmpl_process3.pl,1.4,1.5 xgettext.pl,1.3,1.4
Date: Sun, 22 Feb 2004 13:34:44 -0800

Update of /cvsroot/koha/koha/misc/translator
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv471

Modified Files:
        TmplToken.pm TmplTokenizer.pm tmpl_process3.pl xgettext.pl 
Log Message:
Preliminary support for "analysis" of strings with <a> tags.

Early termination of analysis if we encounter some strings, such as </h1>
or | or ||, in order to avoid extracting strings that are unnecessarily
long and which doesn't add any meaningful context.


Index: TmplToken.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplToken.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** TmplToken.pm        19 Feb 2004 21:24:30 -0000      1.3
--- TmplToken.pm        22 Feb 2004 21:34:40 -0000      1.4
***************
*** 89,92 ****
--- 89,98 ----
  
  # only meaningful for TEXT_PARAMETRIZED tokens
+ sub anchors {
+     my $this = shift;
+     return map { $_->type == TmplTokenType::TAG && $_->string =~ /^<a\b/is? 
$_: ()} @{$this->{'_kids'}};
+ }
+ 
+ # only meaningful for TEXT_PARAMETRIZED tokens
  sub form {
      my $this = shift;

Index: TmplTokenizer.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplTokenizer.pm,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -r1.21 -r1.22
*** TmplTokenizer.pm    22 Feb 2004 09:04:53 -0000      1.21
--- TmplTokenizer.pm    22 Feb 2004 21:34:40 -0000      1.22
***************
*** 421,431 ****
  }
  
! sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
      my($t) = @_;
!     return $t->type == TmplTokenType::TEXT
        || ($t->type == TmplTokenType::DIRECTIVE
                && $t->string =~ /^(?:$re_tmpl_var)$/os)
        || ($t->type == TmplTokenType::TAG
!               && ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
                || ($t->string =~ /^<input/i
                    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
--- 421,442 ----
  }
  
! sub _token_groupable1_p ($) { # as first token, groupable into 
TEXT_PARAMETRIZED
      my($t) = @_;
!     return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
        || ($t->type == TmplTokenType::DIRECTIVE
                && $t->string =~ /^(?:$re_tmpl_var)$/os)
        || ($t->type == TmplTokenType::TAG
!               && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
!               || ($t->string =~ /^<input/i
!                   && $t->attributes->{'type'} =~ /^(?:text)$/i)))
! }
! 
! sub _token_groupable2_p ($) { # as other token, groupable into 
TEXT_PARAMETRIZED
!     my($t) = @_;
!     return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || 
$t->string !~ /^[\|\s]+$/s))
!       || ($t->type == TmplTokenType::DIRECTIVE
!               && $t->string =~ /^(?:$re_tmpl_var)$/os)
!       || ($t->type == TmplTokenType::TAG
!               && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
                || ($t->string =~ /^<input/i
                    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
***************
*** 440,444 ****
  sub _formalize ($) {
      my($t) = @_;
!     return $t->type == TmplTokenType::DIRECTIVE? '%s': 
_quote_cformat($t->string);
  }
  
--- 451,458 ----
  sub _formalize ($) {
      my($t) = @_;
!     return $t->type == TmplTokenType::DIRECTIVE? '%s':
!          $t->type == TmplTokenType::TAG?
!                  ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
!              _quote_cformat($t->string);
  }
  
***************
*** 453,456 ****
--- 467,471 ----
                }
            };
+     &$undo_trailing_blanks;
      # FIXME: If the last token is a close tag but there are no tags
      # FIXME: before it, drop the close tag back into the queue. This
***************
*** 515,558 ****
        if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
            && ($it->type == TmplTokenType::TEXT?
!               !blank_p( $it->string ): _token_groupable_p( $it ))) {
            my @structure = ( $it );
!           my($n_trailing_spaces, $next) = (0, undef);
!           my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
            if ($it->type == TmplTokenType::TEXT) {
                $nonblank_text_p = 1 if !blank_p( $it->string );
            } elsif ($it->type == TmplTokenType::DIRECTIVE) {
                $parametrized_p = 1;
            }
!           for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
                $next = $this->_next_token_intermediate($h);
                push @structure, $next; # for consistency (with initialization)
!           last unless defined $next && _token_groupable_p( $next );
                if ($next->type == TmplTokenType::TEXT) {
!                   if (blank_p( $next->string )) {
!                       $n_trailing_spaces += 1;
!                   } else {
!                       ($n_trailing_spaces, $nonblank_text_p) = (0, 1);
!                   }
                } elsif ($next->type == TmplTokenType::DIRECTIVE) {
-                   $n_trailing_spaces = 0;
                    $parametrized_p = 1;
!               } else {
!                   $n_trailing_spaces = 0;
                }
            }
            # Undo the last token
            push @{$this->{_queue}}, pop @structure;
!           # Undo trailing blank tokens
!           for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
!               push @{$this->{_queue}}, pop @structure;
!           }
            @structure = $this->_optimize( @structure );
            if (@structure < 2) {
                # Nothing to do
                ;
!           } elsif ($nonblank_text_p && $parametrized_p) {
                # Create the corresponding c-format string
                my $string = join('', map { $_->string } @structure);
                my $form = join('', map { _formalize $_ } @structure);
                $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, 
$it->line_number, $it->pathname);
                $it->set_form( $form );
--- 530,583 ----
        if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
            && ($it->type == TmplTokenType::TEXT?
!               !blank_p( $it->string ): _token_groupable1_p( $it ))) {
            my @structure = ( $it );
!           my @tags = ();
!           my $next = undef;
!           my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
            if ($it->type == TmplTokenType::TEXT) {
                $nonblank_text_p = 1 if !blank_p( $it->string );
            } elsif ($it->type == TmplTokenType::DIRECTIVE) {
                $parametrized_p = 1;
+           } elsif ($it->type == TmplTokenType::TAG && $it->string =~ 
/^<([A-Z0-9]+)/is) {
+               push @tags, lc($1);
+               $with_anchor_p = 1 if lc($1) eq 'a';
            }
!           # We hate | and || in msgid strings, so we try to avoid them
!           for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == 
TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
                $next = $this->_next_token_intermediate($h);
                push @structure, $next; # for consistency (with initialization)
!           last unless defined $next && _token_groupable2_p( $next );
!           last if $quit_next_p;
                if ($next->type == TmplTokenType::TEXT) {
!                   $nonblank_text_p = 1 if !blank_p( $next->string );
!                   $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
                } elsif ($next->type == TmplTokenType::DIRECTIVE) {
                    $parametrized_p = 1;
!               } elsif ($next->type == TmplTokenType::TAG) {
!                   if ($next->string =~ /^<([A-Z0-9]+)/is) {
!                       push @tags, lc($1);
!                       $with_anchor_p = 1 if lc($1) eq 'a';
!                   } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
!                       my $close = lc($1);
!                       $quit_p = 1 unless @tags && $close eq $tags[$#tags];
!                       $quit_next_p = 1 if $close =~ /^h\d$/;
!                       pop @tags;
!                   }
                }
+           last if $quit_p;
            }
            # Undo the last token
            push @{$this->{_queue}}, pop @structure;
!           # Simply it a bit more
            @structure = $this->_optimize( @structure );
            if (@structure < 2) {
                # Nothing to do
                ;
!           } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
                # Create the corresponding c-format string
                my $string = join('', map { $_->string } @structure);
                my $form = join('', map { _formalize $_ } @structure);
+               my $a_counter = 0;
+               $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
                $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, 
$it->line_number, $it->pathname);
                $it->set_form( $form );
***************
*** 605,612 ****
  
  # Some functions that shouldn't be here... should be moved out some time
! sub parametrize ($@) {
!     my($fmt, @params) = @_;
      my $it = '';
!     for (my $n = 0; length $fmt;) {
        if ($fmt =~ /^[^%]+/) {
            $fmt = $';
--- 630,637 ----
  
  # Some functions that shouldn't be here... should be moved out some time
! sub parametrize ($$$) {
!     my($fmt_0, $params, $anchors) = @_;
      my $it = '';
!     for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
        if ($fmt =~ /^[^%]+/) {
            $fmt = $';
***************
*** 620,624 ****
            $fmt = $';
            if (!defined $width && !defined $prec) {
!               $it .= $params[$i]
            } elsif (defined $width && defined $prec && !$width && !$prec) {
                ;
--- 645,649 ----
            $fmt = $';
            if (!defined $width && !defined $prec) {
!               $it .= $params->[$i]
            } elsif (defined $width && defined $prec && !$width && !$prec) {
                ;
***************
*** 634,637 ****
--- 659,678 ----
        }
      }
+     for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
+       if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
+           $fmt = $';
+           $it .= $&;
+       } elsif ($fmt =~ /^<a(\d+)>/is) {
+           $n += 1;
+           my $i  = $1;
+           $fmt = $';
+           my $anchor = $anchors->[$i - 1];
+           warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef 
#FIXME
+                   unless defined $anchor;
+           $it .= $anchor->string;
+       } else {
+           die "Completely confused decoding anchors: $fmt\n";#XXX
+       }
+     }
      return $it;
  }

Index: tmpl_process3.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/tmpl_process3.pl,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** tmpl_process3.pl    22 Feb 2004 08:18:27 -0000      1.4
--- tmpl_process3.pl    22 Feb 2004 21:34:40 -0000      1.5
***************
*** 89,96 ****
        } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
            my $fmt = find_translation($s->form);
!           print $output TmplTokenizer::parametrize($fmt, map {
                my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
                $kind == TmplTokenType::TAG && %$attr?
!                   text_replace_tag($t, $attr): $t } $s->parameters);
        } elsif ($kind eq TmplTokenType::TAG && %$attr) {
            print $output text_replace_tag($t, $attr);
--- 89,96 ----
        } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
            my $fmt = find_translation($s->form);
!           print $output TmplTokenizer::parametrize($fmt, [ map {
                my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
                $kind == TmplTokenType::TAG && %$attr?
!                   text_replace_tag($t, $attr): $t } $s->parameters ], [ 
$s->anchors ]);
        } elsif ($kind eq TmplTokenType::TAG && %$attr) {
            print $output text_replace_tag($t, $attr);
***************
*** 298,302 ****
  
  This is an experimental version of the tmpl_process.pl script,
! using standard gettext-style PO files.
  
  Currently, the create, update, and install actions have all been
--- 298,303 ----
  
  This is an experimental version of the tmpl_process.pl script,
! using standard gettext-style PO files.  Note that the behaviour
! of this script should still be considered unstable.
  
  Currently, the create, update, and install actions have all been
***************
*** 316,321 ****
  The --help option has not been implemented yet.
  
! There are probably some real bugs too, since this has not been
! tested very much.
  
  xgettext.pl must be present in the current directory; the
--- 317,328 ----
  The --help option has not been implemented yet.
  
! If an extracted string contain actual text (versus tags or
! TMPL_VAR directives), the strings are extracted verbatim,
! resulting in unwieldy things like multiple spaces, tabs,
! and/or newlines which are semantically indistinguishable
! from single blanks. If the template writer changes the
! spacing just a little bit, the new formatting would be
! considered new strings. This is arguably wrong, and in any
! case counter-productive.
  
  xgettext.pl must be present in the current directory; the
***************
*** 332,335 ****
--- 339,345 ----
  (e.g., to get rid of the "Strange line" warning for #~).
  
+ There are probably some other bugs too, since this has not been
+ tested very much.
+ 
  =head1 SEE ALSO
  

Index: xgettext.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/xgettext.pl,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** xgettext.pl 22 Feb 2004 06:46:17 -0000      1.3
--- xgettext.pl 22 Feb 2004 21:34:40 -0000      1.4
***************
*** 22,37 ****
  
###############################################################################
  
! sub negligible_p ($) {
      my($t) = @_;                              # a string
      # Don't emit pure whitespace, pure numbers, pure punctuation,
      # single letters, or TMPL_VAR's.
      # Punctuation should arguably be translated. But without context
!     # they are untranslatable.
      return !$extract_all_p && (
!              TmplTokenizer::blank_p($t)               # blank or TMPL_VAR
            || $t =~ /^\d+$/                    # purely digits
!           || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
            || $t =~ /^[A-Za-z]$/               # single letters
!       );
  }
  
--- 22,51 ----
  
###############################################################################
  
! sub string_negligible_p ($) {
      my($t) = @_;                              # a string
      # Don't emit pure whitespace, pure numbers, pure punctuation,
      # single letters, or TMPL_VAR's.
      # Punctuation should arguably be translated. But without context
!     # they are untranslatable. Note that $t is a string, not a token object.
      return !$extract_all_p && (
!              TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
            || $t =~ /^\d+$/                    # purely digits
!           || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
            || $t =~ /^[A-Za-z]$/               # single letters
!       )
! }
! 
! sub token_negligible_p( $ ) {
!     my($x) = @_;
!     my $t = $x->type;
!     return !$extract_all_p && (
!           $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
!           $t == TmplTokenType::DIRECTIVE? 1:
!           $t == TmplTokenType::TEXT_PARAMETRIZED
!               && join( '', map { my $t = $_->type;
!                       $t == TmplTokenType::DIRECTIVE?
!                               '1': $t == TmplTokenType::TAG?
!                                       '': token_negligible_p( $_ )?
!                                       '': '1' } @{$x->children} ) eq '' );
  }
  
***************
*** 40,45 ****
  sub remember ($$) {
      my($token, $string) = @_;
!     $text{$string} = [] unless defined $text{$string};
!     push @{$text{$string}}, $token;
  }
  
--- 54,62 ----
  sub remember ($$) {
      my($token, $string) = @_;
!     # If we determine that the string is negligible, don't bother to remember
!     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
!       $text{$string} = [] unless defined $text{$string};
!       push @{$text{$string}}, $token;
!     }
  }
  
***************
*** 70,77 ****
        my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
        if ($kind eq TmplTokenType::TEXT) {
-           #$t = TmplTokenizer::trim $t;
            remember( $s, $t ) if $t =~ /\S/s;
        } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
-           #$t = TmplTokenizer::trim $t;
            remember( $s, $s->form ) if $s->form =~ /\S/s;
        } elsif ($kind eq TmplTokenType::TAG && %$attr) {
--- 87,92 ----
***************
*** 97,101 ****
      # Emit all extracted strings.
      for my $t (string_list) {
!       printf OUTPUT "%s\n", $t unless negligible_p($t);
      }
  }
--- 112,116 ----
      # Emit all extracted strings.
      for my $t (string_list) {
!       printf OUTPUT "%s\n", $t # unless negligible_p($t);
      }
  }
***************
*** 128,132 ****
      my $directory_re = quotemeta("$directory/");
      for my $t (string_list) {
!       next if negligible_p($t);
        my $cformat_p;
        for my $token (@{$text{$t}}) {
--- 143,147 ----
      my $directory_re = quotemeta("$directory/");
      for my $t (string_list) {
!       #next if negligible_p($t);
        my $cformat_p;
        for my $token (@{$text{$t}}) {
***************
*** 317,331 ****
  =back
  
! Right now it does about the same thing as text-extract2.pl but
! generates gettext-style output; however, because it is scanner-
! instead of parser-based, it is able to address the 4 weaknesses
! listed in translator_doc.txt.  Ultimately, the goal is to make
! this able to do some kind of simple analysis on the input to
! produce gettext-style output with c-format strings, in order to
! facilitate translation to languages with a different word order
! than English.
  
! When the above is finished, the generated po file may contain
! some HTML tags in addition to %s strings.
  
  If you want to generate GNOME-style POTFILES.in files, such
--- 332,340 ----
  =back
  
! Note that this script is experimental and should still be
! considered unstable.
  
! Please refer to the explanation in tmpl_process3 for further
! details.
  
  If you want to generate GNOME-style POTFILES.in files, such




reply via email to

[Prev in Thread] Current Thread [Next in Thread]