# -*- coding: utf-8 -*-
# DictionaryIndex
# Copyright 2008-2010 Vasudev Kamath ,Santhosh Thottingal
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

import sys
import os
import timeit
import codecs


class DictionaryIndex:
    """
    Builds and loads first-letter indexes for word-list dictionaries.

    An index maps the first character of a line to the byte offset of
    the first line in the dictionary file that starts with that
    character. Dictionaries and their index files live in the "dicts"
    directory next to this module (see self.path).
    """

    def __init__(self):
        self.fp = None               # most recently opened input file
        self.op = None               # most recently opened output file
        self.dictionary = dict()     # first character -> offset
        self.offset = 0              # running byte offset used by createIndex
        self.path = os.path.join(os.path.dirname(__file__), "dicts")
        self.dictionary_file = None
        self.index_file = None

    def _index_path(self, dictfile):
        """
        Returns the path of the index file that belongs to dictfile:
        the text before the first "." in dictfile plus ".index",
        inside self.path (for eg. en_US.dic -> en_US.index).
        """
        return os.path.join(self.path, dictfile.split(".")[0] + ".index")

    def createIndex(self, dictfile):
        """
        Creates the index for a dictionary.

        The index is built as a dictionary object (kept in
        self.dictionary as character -> integer byte offset) and
        written to a file named after the dictionary with the extension
        .index, one entry per line, in the following format
            A=1
            B=2000
            ....
        (For eg. en_US.index)

        State left over from an earlier call on the same instance is
        discarded, so every index starts at offset 0.

        @param dictfile : name of the dictionary for which index should
                          be created
        """
        self.dictionary_file = os.path.join(self.path, dictfile)
        self.index_file = self._index_path(dictfile)

        # Start from a clean slate: a previous createIndex/loadIndexFor call
        # must not leak its offsets or letters into this index.
        self.dictionary = {}
        self.offset = 0

        # Read the whole dictionary before touching the index file, so that a
        # missing or unreadable dictionary never leaves a truncated index.
        with codecs.open(self.dictionary_file, "r", encoding="utf-8") as source:
            self.fp = source
            for item in source:
                first = item[0]
                # Record the offset of the first line seen for each leading
                # character. Line terminators (blank lines) are not indexed
                # because they would break the line-oriented index file.
                if first not in "\r\n" and first not in self.dictionary:
                    self.dictionary[first] = self.offset
                # Offsets are byte positions, so count the encoded length.
                self.offset = self.offset + len(item.encode("utf-8"))

        with codecs.open(self.index_file, "w", encoding="utf-8") as target:
            self.op = target
            # Write entries in file order so the output is deterministic.
            entries = sorted(self.dictionary.items(), key=lambda pair: pair[1])
            for first, position in entries:
                target.write("%s=%d\n" % (first, position))

    def loadIndexFor(self, dictfile):
        """
        This function reads the index file and loads the content into a
        dictionary object. If the index file can't be opened this will
        create the index file first and then read it.

        The values are the offsets as decimal strings exactly as stored
        in the index file (without the line terminator); use int() to
        convert them before seeking.

        @param dictfile: Dictionary for which the index file is to be loaded
        returns - dictionary object containing indexing information
        """
        self.index_file = self._index_path(dictfile)
        try:
            handle = codecs.open(self.index_file, "r", encoding="utf-8",
                                 errors="ignore")
        except IOError:
            print("ioerror")
            self.createIndex(dictfile)
            handle = codecs.open(self.index_file, "r", encoding="utf-8")

        self.fp = handle
        self.dictionary = {}
        with handle:
            for text in handle:
                # Split on the LAST "=" so that the entry for the letter "="
                # itself (stored as "==<offset>") is not lost.
                key, separator, value = text.rpartition("=")
                if separator:
                    self.dictionary[key] = value.rstrip("\r\n")
        return self.dictionary


if __name__ == "__main__":
    # Index the dictionaries named on the command line, or mr_IN.dic when
    # none are given. Other dictionaries this script has been used with:
    # ml_IN.dic kn_IN.dic en_US.dic gu_IN.dic ta_IN.dic or_IN.dic pa_IN.dic
    # bn_IN.dic bn_BD.dic
    dictfiles = sys.argv[1:] or ["mr_IN.dic"]
    index = DictionaryIndex()
    for dictfile in dictfiles:
        # Time the call itself; a bare timeit.Timer() would only time "pass".
        started = timeit.default_timer()
        index.createIndex(dictfile)
        print(timeit.default_timer() - started)

    # Example: load an index and read the content at the index position
    # for a given letter
    #dic = index.loadIndexFor("kn_IN.dic")
    #offset=int(dic[u'ಕ'])
    #fp = codecs.open("dicts/kn_IN.dic","r",encoding="utf-8")
    #fp.seek(offset)
    #print(fp.readline())