;; guile-gotofish ;; ;; Copyright (C) 2019 Amirouche Boubekki ;; ;; This library is free software; you can redistribute it and/or ;; modify it under the terms of the GNU Lesser General Public ;; License as published by the Free Software Foundation; either ;; version 3 of the License, or (at your option) any later version. ;; ;; This library is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; Lesser General Public License for more details. ;; ;; You should have received a copy of the GNU Lesser General Public License ;; along with this library; if not, write to the Free Software Foundation, ;; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. (export string->stem) (import (scheme base)) (import (scheme hash-table)) (import (scheme comparator)) (import (scheme generator)) (import (scheme process-context)) (import (scheme mapping hash)) (import (scheme set)) (import (ice-9 match)) (import (snowball-stemmer)) (import (wiredtiger okvs)) (import (wiredtiger nstore)) (define comparator (make-default-comparator)) (define punctuation (make-hash-table comparator)) (let loop ((chars (string->list "!\"#$%&\\'()*+,-./:;<=>address@hidden|}~\n\t"))) (unless (null? chars) (hash-table-set! punctuation (car chars) #t) (loop (cdr chars)))) (define (clean string) "Replace punctuation characters from STRING with a space character" (string-map (lambda (char) (if (hash-table-ref punctuation char (const #f)) #\space char)) string)) (define split (lambda (x) (string-split x #\space))) ;; setup stopwords (define %stopwords "a able about above abst accordance according accordingly across act actually added adj affected affecting affects after afterwards again against ah ain't all allow allows almost alone along already also although always am among amongst an and announce another any anybody anyhow anymore anyone anything anyway anyways anywhere apart apparently appear appreciate appropriate approximately are aren arent aren't arise around as a's aside ask asking associated at auth available away awfully b back be became because become becomes becoming been before beforehand begin beginning beginnings begins behind being believe below beside besides best better between beyond biol both brief briefly but by c ca came can cannot cant can't cause causes certain certainly changes clearly c'mon co com come comes concerning consequently consider considering contain containing contains corresponding could couldnt couldn't course c's currently d date definitely described despite did didn't different do does doesn't doing done don't down downwards due during e each ed edu effect eg eight eighty either else elsewhere end ending enough entirely especially et et-al etc even ever every everybody everyone everything everywhere ex exactly example except f far few ff fifth first five fix followed following follows for former formerly forth found four from further furthermore g gave get gets getting give given gives giving go goes going gone got gotten greetings h had hadn't happens hardly has hasn't have haven't having he hed he'd he'll hello hence her here hereafter hereby herein heres here's hereupon hers herself hes he's hi hid him himself his hither home hopefully how howbeit however how's hundred i I id i'd ie if ignored i'll im i'm immediate immediately importance important in inasmuch inc indeed index indicate indicated indicates information inner insofar instead into invention inward is isn't it itd it'd it'll its it's itself i've j just k keep keep keeps keeps kept kg km know known knows l largely last lately later latter latterly least less lest let lets let's like liked likely line little 'll look looking looks ltd m made mainly make makes many may maybe me mean means meantime meanwhile merely mg might million miss ml more moreover most mostly mr mrs much mug must mustn't my myself n na name namely nay nd near nearly necessarily necessary need needs neither never nevertheless new next nine ninety no nobody non none nonetheless noone nor normally nos not noted nothing novel now nowhere o obtain obtained obviously of off often oh ok okay old omitted on once one ones only onto or ord other others otherwise ought our ours ourselves out outside over overall owing own p page pages part particular particularly past per perhaps placed please plus poorly possible possibly potentially pp predominantly present presumably previously primarily probably promptly proud provides put q que quickly quite qv r ran rather rd re readily really reasonably recent recently ref refs regarding regardless regards related relatively research respectively resulted resulting results right run s said same saw say saying says sec second secondly section see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall shan't she shed she'd she'll shes she's should shouldn't show showed shown showns shows significant significantly similar similarly since six slightly so some somebody somehow someone somethan something sometime sometimes somewhat somewhere soon sorry specifically specified specify specifying still stop strongly sub substantially successfully such sufficiently suggest sup sure sure t take taken taking tell tends th than thank thanks thanx that that'll thats that's that've the their theirs them themselves then thence there thereafter thereby thered therefore therein there'll thereof therere theres there's thereto thereupon there've these they theyd they'd they'll theyre they're they've think third this thorough thoroughly those thou though thoughh thousand three throug through throughout thru thus til tip to together too took toward towards tried tries truly try trying ts t's twice two u un under unfortunately unless unlike unlikely until unto up upon ups us use used useful usefully usefulness uses using usually v value various 've very via viz vol vols vs w want wants was wasnt wasn't way we wed we'd welcome well we'll went were we're werent weren't we've what whatever what'll whats what's when whence whenever when's where whereafter whereas whereby wherein wheres where's whereupon wherever whether which while whim whither who whod whoever whole who'll whom whomever whos who's whose why why's widely will willing wish with within without wonder wont won't words world would wouldnt wouldn't www x y yes yet you youd you'd you'll your youre you're yours yourself yourselves you've z zero ") (define stopwords (make-hash-table comparator)) (let loop ((words (split (clean %stopwords)))) (unless (null? words) (hash-table-set! stopwords (car words) #t) (loop (cdr words)))) (define (filter-stopwords lst) (filter (lambda (token) (not (hash-table-ref stopwords token (const #f)))) lst)) ;; string->stems (define (sanitize words) "Only keep words that have length bigger than one" (filter (lambda (word) (< 1 (string-length word))) words)) (define english (make-stemmer "english")) (define string->stems ;; TODO: uniquify (compose (lambda (words) (map (lambda (word) (stem english word)) words)) filter-stopwords sanitize split string-downcase clean)) ;;; helpers (define-public (scm->string scm) (call-with-output-string (lambda (port) (write scm port)))) (define-public (string->scm string) (call-with-input-string string read)) ;; okvs & nstore helpers (define *bigish* (expt 2 256)) (define (some? transaction nstore collection uid) (let ((generator (nstore-from transaction nstore (list collection uid (nstore-var 'key) (nstore-var 'value))))) (not (eof-object? (generator))))) (define random-uid (lambda (transaction nstore collection) (let loop ((index 3)) (if (zero? index) (raise (cons 'gotofish "Ooops! No more random uid!")) (let ((candidate (number->string (random *bigish*) 36))) (if (some? transaction nstore collection candidate) (loop (- index 1)) candidate)))))) ;; database setup (define (triplestore) (let ((engine (nstore-engine okvs-ref okvs-set! okvs-rm! okvs-prefix))) (nstore engine #vu8(01) '(uid key value)))) ;; TODO: mkdir directory (define home (string-append (getenv "HOME") "/.gotofish")) (define okvs (okvs `((home . ,home) (create? . #t) (wal? . #t)))) (define store (triplestore)) ;; index (define (read-lines) (let loop ((line (read-line)) (out '())) (if (eof-object? line) (string-join out " ") (loop (read-line) (cons line out))))) (define add! (okvs-transactional (lambda (transaction store uid stem) (nstore-add! transaction store (list uid 'stem stem))))) (define (index uid) (let ((document (read-lines))) (let loop ((stems (string->stems document))) (unless (null? stems) (add! okvs store uid (car stems)) (loop (cdr stems))))) (close (current-input-port)) (format #t "Done!\n")) ;; search (define %lookup (okvs-transactional (lambda (transaction store stem) (apply set comparator (map (lambda (x) (hashmap-ref x 'uid (const #f))) (generator->list (nstore-from transaction store (list (nstore-var 'uid) 'stem stem)))))))) (define (lookup keyword) (%lookup okvs store (stem english (string-downcase keyword)))) (define (search seed keywords) (let loop ((keywords keywords) (out (lookup seed))) (if (null? keywords) (for-each (lambda (x) (format #t "** ~a\n" x)) (set->list out)) (loop (cdr keywords) (set-intersection (lookup (car keywords)) out))))) ;; exec (match (cdr (command-line)) (("index" uid) (index uid)) (("search" keyword keywords ...) (search keyword keywords)) (else (format #t "I do not understand what you want!\n"))) (okvs-close okvs)