sdx-developers
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?


From: Pierre Dittgen
Subject: Re: [sdx-developers] Intégration à SDX2.2 de la recherche des chiffres/nombres dans le texte ?
Date: Thu, 22 Apr 2004 11:37:31 +0200
User-agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.5) Gecko/20031007


Pas de mon côté. Tu peux envoyer un patch ?


Le voilà.

Pierre
--
Pierre Dittgen, address@hidden
PASS Technologie http://www.pass-tech.fr
diff -urN analysis/Analyzer_br.java analysis.new/Analyzer_br.java
--- analysis/Analyzer_br.java   2004-04-22 11:35:06.251500000 +0200
+++ analysis.new/Analyzer_br.java       2004-04-13 17:51:53.546875000 +0200
@@ -30,16 +30,16 @@
 package fr.gouv.culture.sdx.search.lucene.analysis;
 
 import fr.gouv.culture.sdx.search.lucene.analysis.filter.BrazilianStemFilter;
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LetterOrDigitTokenizer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.de.WordlistLoader;
 import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.File;
-import java.io.IOException;
 import java.io.Reader;
+import java.io.IOException;
 import java.util.Hashtable;
 
 /*
@@ -103,7 +103,7 @@
      * Builds an analyzer with the given stop words.
      */
     public Analyzer_br(File stopwords) throws IOException {
-        super.stopTable = WordlistLoader.getWordtable(stopwords);
+            super.stopTable = WordlistLoader.getWordtable(stopwords);
     }
 
     /**
@@ -134,7 +134,7 @@
      *                         StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
      */
     public final TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(reader);
+        TokenStream result = new LetterOrDigitTokenizer(reader);
         result = new StandardFilter(result);
         result = new StopFilter(result, super.stopTable);
         result = new BrazilianStemFilter(result, super.excludeTable);
diff -urN analysis/Analyzer_cz.java analysis.new/Analyzer_cz.java
--- analysis/Analyzer_cz.java   2004-04-22 11:35:06.407750000 +0200
+++ analysis.new/Analyzer_cz.java       2004-04-13 18:01:20.484375000 +0200
@@ -54,12 +54,12 @@
  * <http://www.apache.org/>.
  */
 
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LetterOrDigitTokenizer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.de.WordlistLoader;
 import org.apache.lucene.analysis.standard.StandardFilter;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.*;
 import java.util.Hashtable;
@@ -166,7 +166,7 @@
      *                         StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter
      */
     public final TokenStream tokenStream(String fieldName, Reader reader) {
-        TokenStream result = new StandardTokenizer(reader);
+        TokenStream result = new LetterOrDigitTokenizer(reader);
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
         result = new StopFilter(result, stopTable);
diff -urN analysis/Analyzer_fr.java analysis.new/Analyzer_fr.java
--- analysis/Analyzer_fr.java   2004-04-22 11:35:06.673375000 +0200
+++ analysis.new/Analyzer_fr.java       2004-04-13 17:19:06.390625000 +0200
@@ -31,12 +31,12 @@
 
 import fr.gouv.culture.sdx.search.lucene.analysis.filter.FrenchStandardFilter;
 import fr.gouv.culture.sdx.search.lucene.analysis.filter.ISOLatin1AccentFilter;
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LetterOrDigitTokenizer;
 import org.apache.avalon.framework.configuration.Configuration;
 import org.apache.avalon.framework.configuration.ConfigurationException;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
 
 import java.io.Reader;
 
@@ -94,7 +94,7 @@
         TokenStream result;
 
         // Builds the chain...
-        result = new StandardTokenizer(reader);
+        result = new LetterOrDigitTokenizer(reader);
 
         FrenchStandardFilter fsf = new FrenchStandardFilter();
         fsf.enableLogging(logger);
diff -urN analysis/CVS/Entries analysis.new/CVS/Entries
--- analysis/CVS/Entries        2004-04-22 11:35:07.126500000 +0200
+++ analysis.new/CVS/Entries    2004-04-13 16:44:54.296875000 +0200
@@ -1,18 +1,18 @@
+/AbstractAnalyzer.java/1.11/Wed Mar 24 18:26:17 2004//
+/Analyzer.java/1.7/Thu Feb  6 14:10:08 2003//
+/AnalyzerManager.java/1.20/Fri Mar 26 15:26:37 2004//
+/Analyzer_ar.java/1.5/Wed Mar 24 18:26:17 2004//
+/Analyzer_br.java/1.2/Tue Apr  6 19:01:15 2004//
+/Analyzer_cn.java/1.4/Thu Feb  6 14:10:08 2003//
+/Analyzer_cz.java/1.4/Tue Apr  6 19:01:15 2004//
+/Analyzer_de.java/1.3/Mon Jan 12 15:07:40 2004//
+/Analyzer_en.java/1.1/Sun May 26 21:30:10 2002//
+/Analyzer_fr.java/1.13/Thu Feb  6 14:10:08 2003//
+/Analyzer_ru.java/1.3/Mon Jan 19 11:56:20 2004//
+/DefaultAnalyzer.java/1.13/Mon Jan 12 15:07:40 2004//
+/Glosser_ar_en.java/1.5/Wed Mar 24 18:26:17 2004//
+/MetaAnalyzer.java/1.10/Wed Mar 24 18:26:17 2004//
+/package.html/1.2/Tue Aug 27 16:50:19 2002//
 D/filter////
 D/stemmer////
 D/tokenizer////
-/AbstractAnalyzer.java/1.11/Thu Apr 22 09:35:05 2004//
-/Analyzer.java/1.7/Thu Apr 22 09:35:05 2004//
-/AnalyzerManager.java/1.20/Thu Apr 22 09:35:05 2004//
-/Analyzer_ar.java/1.5/Thu Apr 22 09:35:05 2004//
-/Analyzer_br.java/1.3/Thu Apr 22 09:35:06 2004//
-/Analyzer_cn.java/1.4/Thu Apr 22 09:35:06 2004//
-/Analyzer_cz.java/1.4/Thu Apr 22 09:35:06 2004//
-/Analyzer_de.java/1.3/Thu Apr 22 09:35:06 2004//
-/Analyzer_en.java/1.1/Thu Apr 22 09:35:06 2004//
-/Analyzer_fr.java/1.13/Thu Apr 22 09:35:06 2004//
-/Analyzer_ru.java/1.3/Thu Apr 22 09:35:06 2004//
-/DefaultAnalyzer.java/1.13/Thu Apr 22 09:35:06 2004//
-/Glosser_ar_en.java/1.5/Thu Apr 22 09:35:07 2004//
-/MetaAnalyzer.java/1.11/Thu Apr 22 09:35:07 2004//
-/package.html/1.2/Thu Apr 22 09:35:07 2004//
diff -urN analysis/CVS/Entries.Extra analysis.new/CVS/Entries.Extra
--- analysis/CVS/Entries.Extra  2004-04-22 11:35:07.126500000 +0200
+++ analysis.new/CVS/Entries.Extra      2004-04-13 16:44:54.296875000 +0200
@@ -1,6 +1,3 @@
-D/filter///
-D/stemmer///
-D/tokenizer///
 /AbstractAnalyzer.java///
 /Analyzer.java///
 /AnalyzerManager.java///
@@ -16,3 +13,6 @@
 /Glosser_ar_en.java///
 /MetaAnalyzer.java///
 /package.html///
+D/filter///
+D/stemmer///
+D/tokenizer///
diff -urN analysis/CVS/Entries.Log analysis.new/CVS/Entries.Log
--- analysis/CVS/Entries.Log    2004-04-22 11:35:09.735875000 +0200
+++ analysis.new/CVS/Entries.Log        1970-01-01 01:00:00.000000000 +0100
@@ -1,3 +0,0 @@
-A D/filter////
-A D/stemmer////
-A D/tokenizer////
diff -urN analysis/DefaultAnalyzer.java analysis.new/DefaultAnalyzer.java
--- analysis/DefaultAnalyzer.java       2004-04-22 11:35:06.985875000 +0200
+++ analysis.new/DefaultAnalyzer.java   2004-04-13 17:19:41.468750000 +0200
@@ -30,9 +30,9 @@
 package fr.gouv.culture.sdx.search.lucene.analysis;
 
 import fr.gouv.culture.sdx.exception.SDXException;
+import fr.gouv.culture.sdx.search.lucene.analysis.tokenizer.LaxistLowerCaseTokenizer;
 import org.apache.avalon.framework.configuration.Configuration;
 import org.apache.avalon.framework.configuration.ConfigurationException;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 
@@ -129,9 +129,9 @@
     /** Filters LowerCaseTokenizer with StopFilter. */
     public TokenStream tokenStream(String fieldName, Reader reader) {
         if (stopTable != null)
-            return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
+            return new StopFilter(new LaxistLowerCaseTokenizer(reader), stopTable);
         else
-            return new LowerCaseTokenizer(reader);
+            return new LaxistLowerCaseTokenizer(reader);
     }
 
     /**
diff -urN analysis/MetaAnalyzer.java analysis.new/MetaAnalyzer.java
--- analysis/MetaAnalyzer.java  2004-04-22 11:35:07.017125000 +0200
+++ analysis.new/MetaAnalyzer.java      2004-03-24 19:26:17.000000000 +0100
@@ -32,7 +32,7 @@
 import fr.gouv.culture.sdx.exception.SDXException;
 import fr.gouv.culture.sdx.exception.SDXExceptionCode;
 import fr.gouv.culture.sdx.search.lucene.Field;
-import fr.gouv.culture.sdx.search.lucene.FieldList;
+import fr.gouv.culture.sdx.search.lucene.FieldsDefinition;
 import org.apache.lucene.analysis.TokenStream;
 
 import java.io.Reader;
@@ -49,7 +49,7 @@
 public class MetaAnalyzer extends AbstractAnalyzer {
 
     /** The fields definition object. */
-    private FieldList fields;
+    private FieldsDefinition fields;
 
     /** The default analzyer to use. */
     private Analyzer defaultAnalyzer;
@@ -70,7 +70,7 @@
      *
      * @param   fields  The fields and their definitions (cannot be null).
      */
-    public void setUp(FieldList fields) throws SDXException {
+    public void setUp(FieldsDefinition fields) throws SDXException {
 
         if (fields == null) throw new SDXException(logger, SDXExceptionCode.ERROR_FIELDS_DEF_NULL, null, null);
         this.fields = fields;
@@ -106,9 +106,9 @@
         return theAnalyzer.tokenStream(fieldName, reader);
     }
 
-    /** Returns a the FieldList for this MetaAnalyzer (basically a Hashtable of all the Fields)*/
+    /** Returns the FieldsDefinition for this MetaAnalyzer (basically a Hashtable of all the Fields)*/
     //TODO?:is this still necessary, as it exists both in LuceneIndex and MetaAnalyzer?-rbp
-    public FieldList getFieldList() {
+    public FieldsDefinition getFieldsDefinition() {
         return this.fields;
     }
 
diff -urN analysis/tokenizer/LaxistLowerCaseTokenizer.java analysis.new/tokenizer/LaxistLowerCaseTokenizer.java
--- analysis/tokenizer/LaxistLowerCaseTokenizer.java    1970-01-01 01:00:00.000000000 +0100
+++ analysis.new/tokenizer/LaxistLowerCaseTokenizer.java        2004-04-02 14:45:48.000000000 +0200
@@ -0,0 +1,62 @@
+/*
+SDX: Documentary System in XML.
+Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM
+
+Ministere de la culture et de la communication,
+Mission de la recherche et de la technologie
+3 rue de Valois, 75042 Paris Cedex 01 (France)
address@hidden, address@hidden
+
+AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
address@hidden
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the
+Free Software Foundation, Inc.
+59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
+or connect to:
+http://www.fsf.org/copyleft/gpl.html
+*/
+/*
+ * Created by Vim :-)
+ * User: Pierre Dittgen
+ * Date: 2 apr. 2004
+ */
+package fr.gouv.culture.sdx.search.lucene.analysis.tokenizer;
+
+// Jdk import
+import java.io.Reader;
+
+/**
+ * Title: LaxistLowerCaseTokenizer
+ * Description: Like org.apache.lucene.analysis.LowerCaseTokenizer but
+ * inherits from LetterOrDigitTokenizer, not from LetterTokenizer
+ * Copyright:   Copyright (c) 2004
+ * Company:
+ * @author Pierre Dittgen
+ * @version 1.0
+ *
+ */
+public final class LaxistLowerCaseTokenizer extends LetterOrDigitTokenizer
+{
+       public LaxistLowerCaseTokenizer(Reader in)
+       {
+               super(in);
+       }
+
+       protected char normalize(char c)
+       {
+               return Character.toLowerCase(c);
+       }
+}
+
diff -urN analysis/tokenizer/LetterOrDigitTokenizer.java analysis.new/tokenizer/LetterOrDigitTokenizer.java
--- analysis/tokenizer/LetterOrDigitTokenizer.java      1970-01-01 01:00:00.000000000 +0100
+++ analysis.new/tokenizer/LetterOrDigitTokenizer.java  2004-04-02 14:52:42.000000000 +0200
@@ -0,0 +1,68 @@
+/*
+SDX: Documentary System in XML.
+Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM
+
+Ministere de la culture et de la communication,
+Mission de la recherche et de la technologie
+3 rue de Valois, 75042 Paris Cedex 01 (France)
address@hidden, address@hidden
+
+AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
address@hidden
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the
+Free Software Foundation, Inc.
+59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
+or connect to:
+http://www.fsf.org/copyleft/gpl.html
+*/
+/*
+ * Created by Vim :-)
+ * User: Pierre Dittgen
+ * Date: 2 apr. 2004
+ */
+package fr.gouv.culture.sdx.search.lucene.analysis.tokenizer;
+
+// Lucene import
+import org.apache.lucene.analysis.CharTokenizer;
+
+// Jdk import
+import java.io.Reader;
+
+
+/**
+ * Title: LetterOrDigitTokenizer
+ * Description: Like org.apache.lucene.analysis.LetterTokenizer but also
+ * accept digits
+ * Copyright:   Copyright (c) 2004
+ * Company:
+ * @author Pierre Dittgen
+ * @version 1.0
+ *
+ */
+public class LetterOrDigitTokenizer extends CharTokenizer {
+
+
+    public LetterOrDigitTokenizer(Reader in)
+       {
+        super(in);
+    }
+
+       protected boolean isTokenChar(char c)
+       {
+               return Character.isLetterOrDigit(c);
+       }
+
+}
+

reply via email to

[Prev in Thread] Current Thread [Next in Thread]