Improve kanjidic and jmdict formats
modules/nani/kanji/kanjidic.scm
| 132 | 132 | ||
| 133 | 133 | (define (update-trie-pos! trie kanji) | |
| 134 | 134 | (let* ((vals (trie-vals trie)) | |
| 135 | - | (vals (map (lambda (i) (kanji-position (array-ref kanji i))) vals))) | |
| 135 | + | (vals (match vals | |
| 136 | + | ((pos) (kanji-position (array-ref kanji pos))) | |
| 137 | + | (() 0)))) | |
| 136 | 138 | (trie-vals-set! trie vals)) | |
| 137 | 139 | (for-each | |
| 138 | 140 | (match-lambda | |
… | |||
| 173 | 175 | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
| 174 | 176 | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
| 175 | 177 | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
| 176 | - | (serialize-trie (serialize-trie serialize-int int-size)) | |
| 177 | - | (trie-size (trie-size int-size)) | |
| 178 | + | (serialize-trie (serialize-trie-single serialize-int int-size)) | |
| 179 | + | (trie-size (trie-size-single int-size)) | |
| 178 | 180 | (kanji-trie (make-kanji-trie kanji)) | |
| 179 | 181 | (kanji-trie-size (trie-size kanji-trie)) | |
| 180 | 182 | (results-size | |
modules/nani/pitch/pitch.scm
| 58 | 58 | pitches) | |
| 59 | 59 | (compress-trie trie))) | |
| 60 | 60 | ||
| 61 | + | (define (uniq lst) | |
| 62 | + | (let loop ((lst lst) (res '())) | |
| 63 | + | (match lst | |
| 64 | + | (() res) | |
| 65 | + | ((elem lst ...) | |
| 66 | + | (if (member elem res) | |
| 67 | + | (loop lst res) | |
| 68 | + | (loop lst (cons elem res))))))) | |
| 69 | + | ||
| 61 | 70 | (define (collapse-vals! trie) | |
| 62 | 71 | (let ((transitions (trie-transitions trie)) | |
| 63 | 72 | (vals (map (match-lambda ((? string? s) s) ((? number? i) (number->string i))) | |
| 64 | - | (trie-vals trie)))) | |
| 73 | + | (uniq (trie-vals trie))))) | |
| 65 | 74 | (trie-vals-set! trie (string-join vals ", ")) | |
| 66 | 75 | (for-each collapse-vals! (map cdr transitions)))) | |
| 67 | 76 |
modules/nani/result/jibiki.scm
| 293 | 293 | ((equal? elem-gi 'volume) | |
| 294 | 294 | results) | |
| 295 | 295 | ((equal? elem-gi 'article) | |
| 296 | - | (set! results (cons seed results)) | |
| 296 | + | (let ((entry (sxml->result seed frq))) | |
| 297 | + | (set! results (cons entry results))) | |
| 297 | 298 | #f) | |
| 298 | 299 | (else | |
| 299 | 300 | (let* ((seed (reverse seed)) |
modules/nani/result/jmdict.scm
| 426 | 426 | ((equal? elem-gi 'JMdict) | |
| 427 | 427 | results) | |
| 428 | 428 | ((equal? elem-gi 'entry) | |
| 429 | - | (set! results (cons seed results)) | |
| 429 | + | (let ((entry (sxml->result seed frq))) | |
| 430 | + | (set! results (cons entry results))) | |
| 430 | 431 | #f) | |
| 431 | 432 | (else | |
| 432 | 433 | (let* ((seed (reverse seed)) |
modules/nani/result/wadoku.scm
| 351 | 351 | (sub-loop loop infos result lst l))))) | |
| 352 | 352 | ||
| 353 | 353 | (define (sxml->element lst elem frq) | |
| 354 | - | (let ((elem (match elem | |
| 355 | - | ((_ . elem) elem) | |
| 356 | - | (_ elem)))) | |
| 357 | - | (match elem | |
| 358 | - | ('orth (let ((kanji (filter string? lst))) | |
| 359 | - | (if (null? kanji) | |
| 360 | - | #f | |
| 361 | - | `(kanji | |
| 362 | - | ,(string-filter | |
| 363 | - | (lambda (c) | |
| 364 | - | ;; Remove characters that interfere with kanji | |
| 365 | - | ;; based search | |
| 366 | - | (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\??? | |
| 367 | - | #\?? #\???)))) | |
| 368 | - | (car kanji)))))) | |
| 369 | - | ('count #f) | |
| 370 | - | ('entry (sxml->result lst frq)) | |
| 371 | - | ('hira `(reading ,(car lst))) | |
| 372 | - | ('hatsuon `(hatsuon ,(car lst))) | |
| 373 | - | ('accent `(pitch ,(car lst))) | |
| 374 | - | ('reading (sxml->reading lst)) | |
| 375 | - | ('form lst) | |
| 376 | - | ('impli `(impli ,(car lst))) | |
| 377 | - | ('text (let loop ((text "") (lst lst)) | |
| 378 | - | (match lst | |
| 379 | - | (() text) | |
| 380 | - | ((('hasPrecedingSpace . _) lst ...) | |
| 381 | - | (loop (string-append " " text) lst)) | |
| 382 | - | ((('hasFollowingSpace . _) lst ...) | |
| 383 | - | (string-append (loop text lst) " ")) | |
| 384 | - | (((? string? s) lst ...) | |
| 385 | - | (loop (string-append text s) lst))))) | |
| 386 | - | ('famn (sxml->string lst)) | |
| 387 | - | ('expl (sxml->string lst)) | |
| 388 | - | ('expli (filter list? lst)) | |
| 389 | - | ('abbrev (filter list? lst)) | |
| 390 | - | ('token (filter string? lst)) | |
| 391 | - | ('tr (sxml->string lst)) | |
| 392 | - | ('transcr `(transcr ,lst)) | |
| 393 | - | ('trans | |
| 394 | - | (append (filter list? lst) | |
| 395 | - | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
| 396 | - | ('jap `(jap ,lst)) | |
| 397 | - | ('emph lst) | |
| 398 | - | ('title (filter string? lst)) | |
| 399 | - | ('transl (filter string? lst)) | |
| 400 | - | ('topic (filter string? lst)) | |
| 401 | - | ('iron (filter string? lst)) | |
| 402 | - | ('specchar (filter string? lst)) | |
| 403 | - | ('scientif (filter string? lst)) | |
| 404 | - | ('wikide #f) | |
| 405 | - | ('wikija #f) | |
| 406 | - | ('link #f) | |
| 407 | - | ('ref (sxml->ref lst)) | |
| 408 | - | ('sref (sxml->ref lst)) | |
| 409 | - | ('etym (sxml->source lst)) | |
| 410 | - | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
| 411 | - | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 412 | - | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 413 | - | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 414 | - | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 415 | - | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
| 416 | - | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
| 417 | - | ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type)))) | |
| 418 | - | ('usg `(infos . ,(usg->infos lst))) | |
| 419 | - | ('sense (sxml->meaning lst)) | |
| 420 | - | ('steinhaus (let ((ref (sxml->string lst))) | |
| 421 | - | (if (string? ref) | |
| 422 | - | `(ref ,(sxml->string lst)) | |
| 423 | - | (throw 'not-steinhaus ref)))) | |
| 424 | - | ('pos '()); TODO: actually find what info to use | |
| 425 | - | ('wordcomponent (cons 'wordcomponent lst)) | |
| 426 | - | ('meishi (cons 'meishi lst)) | |
| 427 | - | ('setsuzokushi (cons 'setsuzokushi lst)) | |
| 428 | - | ('daimeishi (cons 'daimeishi lst)) | |
| 429 | - | ('doushi (cons 'doushi lst)) | |
| 430 | - | ('kandoushi (cons 'kandoushi lst)) | |
| 431 | - | ('keiyoudoushi (cons 'keiyoudoushi lst)) | |
| 432 | - | ('keiyoushi (cons 'keiyoushi lst)) | |
| 433 | - | ('fukushi (cons 'fukushi lst)) | |
| 434 | - | ('rengo (cons 'rengo lst)) | |
| 435 | - | ('suffix (cons 'suffix lst)) | |
| 436 | - | ('prefix (cons 'prefix lst)) | |
| 437 | - | ('kanji (cons 'kanji lst)) | |
| 438 | - | ('rentaishi (cons 'rentaishi lst)) | |
| 439 | - | ('specialcharacter (cons 'specialcharacter lst)) | |
| 440 | - | ('joshi (cons 'joshi lst)) | |
| 441 | - | ('fukujoshi (cons 'fukujoshi lst)) | |
| 442 | - | ('kakujoshi (cons 'kakujoshi lst)) | |
| 443 | - | ('kakarijoshi (cons 'kakarijoshi lst)) | |
| 444 | - | ('shuujoshi (cons 'shuujoshi lst)) | |
| 445 | - | ('setsuzokujoshi (cons 'setsuzokujoshi lst)) | |
| 446 | - | ('jokeiyoushi (cons 'jokeiyoushi lst)) | |
| 447 | - | ('jodoushi (cons 'jodoushi lst)) | |
| 448 | - | ('ruigos lst) | |
| 449 | - | ('ruigo (ruigo->ref lst)) | |
| 450 | - | ('gramGrp (gram->info lst))))) | |
| 354 | + | (match elem | |
| 355 | + | ('orth (let ((kanji (filter string? lst))) | |
| 356 | + | (if (null? kanji) | |
| 357 | + | #f | |
| 358 | + | `(kanji | |
| 359 | + | ,(string-filter | |
| 360 | + | (lambda (c) | |
| 361 | + | ;; Remove characters that interfere with kanji | |
| 362 | + | ;; based search | |
| 363 | + | (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\??? | |
| 364 | + | #\?? #\???)))) | |
| 365 | + | (car kanji)))))) | |
| 366 | + | ('count #f) | |
| 367 | + | ('entry (sxml->result lst frq)) | |
| 368 | + | ('hira `(reading ,(car lst))) | |
| 369 | + | ('hatsuon `(hatsuon ,(car lst))) | |
| 370 | + | ('accent `(pitch ,(car lst))) | |
| 371 | + | ('reading (sxml->reading lst)) | |
| 372 | + | ('form lst) | |
| 373 | + | ('impli `(impli ,(car lst))) | |
| 374 | + | ('text (let loop ((text "") (lst lst)) | |
| 375 | + | (match lst | |
| 376 | + | (() text) | |
| 377 | + | ((('hasPrecedingSpace . _) lst ...) | |
| 378 | + | (loop (string-append " " text) lst)) | |
| 379 | + | ((('hasFollowingSpace . _) lst ...) | |
| 380 | + | (string-append (loop text lst) " ")) | |
| 381 | + | (((? string? s) lst ...) | |
| 382 | + | (loop (string-append text s) lst))))) | |
| 383 | + | ('famn (sxml->string lst)) | |
| 384 | + | ('expl (sxml->string lst)) | |
| 385 | + | ('expli (filter list? lst)) | |
| 386 | + | ('abbrev (filter list? lst)) | |
| 387 | + | ('token (filter string? lst)) | |
| 388 | + | ('tr (sxml->string lst)) | |
| 389 | + | ('transcr `(transcr ,lst)) | |
| 390 | + | ('trans | |
| 391 | + | (append (filter list? lst) | |
| 392 | + | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
| 393 | + | ('jap `(jap ,lst)) | |
| 394 | + | ('emph lst) | |
| 395 | + | ('title (filter string? lst)) | |
| 396 | + | ('transl (filter string? lst)) | |
| 397 | + | ('topic (filter string? lst)) | |
| 398 | + | ('iron (filter string? lst)) | |
| 399 | + | ('specchar (filter string? lst)) | |
| 400 | + | ('scientif (filter string? lst)) | |
| 401 | + | ('wikide #f) | |
| 402 | + | ('wikija #f) | |
| 403 | + | ('link #f) | |
| 404 | + | ('ref (sxml->ref lst)) | |
| 405 | + | ('sref (sxml->ref lst)) | |
| 406 | + | ('etym (sxml->source lst)) | |
| 407 | + | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
| 408 | + | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 409 | + | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 410 | + | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 411 | + | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 412 | + | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
| 413 | + | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
| 414 | + | ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type)))) | |
| 415 | + | ('usg `(infos . ,(usg->infos lst))) | |
| 416 | + | ('sense (sxml->meaning lst)) | |
| 417 | + | ('steinhaus (let ((ref (sxml->string lst))) | |
| 418 | + | (if (string? ref) | |
| 419 | + | `(ref ,(sxml->string lst)) | |
| 420 | + | (throw 'not-steinhaus ref)))) | |
| 421 | + | ('pos '()); TODO: actually find what info to use | |
| 422 | + | ('wordcomponent (cons 'wordcomponent lst)) | |
| 423 | + | ('meishi (cons 'meishi lst)) | |
| 424 | + | ('setsuzokushi (cons 'setsuzokushi lst)) | |
| 425 | + | ('daimeishi (cons 'daimeishi lst)) | |
| 426 | + | ('doushi (cons 'doushi lst)) | |
| 427 | + | ('kandoushi (cons 'kandoushi lst)) | |
| 428 | + | ('keiyoudoushi (cons 'keiyoudoushi lst)) | |
| 429 | + | ('keiyoushi (cons 'keiyoushi lst)) | |
| 430 | + | ('fukushi (cons 'fukushi lst)) | |
| 431 | + | ('rengo (cons 'rengo lst)) | |
| 432 | + | ('suffix (cons 'suffix lst)) | |
| 433 | + | ('prefix (cons 'prefix lst)) | |
| 434 | + | ('kanji (cons 'kanji lst)) | |
| 435 | + | ('rentaishi (cons 'rentaishi lst)) | |
| 436 | + | ('specialcharacter (cons 'specialcharacter lst)) | |
| 437 | + | ('joshi (cons 'joshi lst)) | |
| 438 | + | ('fukujoshi (cons 'fukujoshi lst)) | |
| 439 | + | ('kakujoshi (cons 'kakujoshi lst)) | |
| 440 | + | ('kakarijoshi (cons 'kakarijoshi lst)) | |
| 441 | + | ('shuujoshi (cons 'shuujoshi lst)) | |
| 442 | + | ('setsuzokujoshi (cons 'setsuzokujoshi lst)) | |
| 443 | + | ('jokeiyoushi (cons 'jokeiyoushi lst)) | |
| 444 | + | ('jodoushi (cons 'jodoushi lst)) | |
| 445 | + | ('ruigos lst) | |
| 446 | + | ('ruigo (ruigo->ref lst)) | |
| 447 | + | ('gramGrp (gram->info lst)))) | |
| 451 | 448 | ||
| 452 | 449 | (define (create-parser frq) | |
| 453 | 450 | (define results '()) | |
… | |||
| 458 | 455 | ||
| 459 | 456 | FINISH-ELEMENT | |
| 460 | 457 | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 461 | - | (cond | |
| 462 | - | ((equal? elem-gi 'entries) | |
| 463 | - | results) | |
| 464 | - | ((equal? elem-gi 'entry) | |
| 465 | - | (set! results (cons seed results)) | |
| 466 | - | #f) | |
| 467 | - | (else | |
| 468 | - | (let* ((seed (reverse seed)) | |
| 469 | - | (element (sxml->element seed elem-gi frq))) | |
| 470 | - | (cons element parent-seed))))) | |
| 458 | + | (let ((elem-gi (match elem-gi | |
| 459 | + | ((_ . elem) elem) | |
| 460 | + | (_ elem-gi)))) | |
| 461 | + | (cond | |
| 462 | + | ((equal? elem-gi 'entries) | |
| 463 | + | results) | |
| 464 | + | ((equal? elem-gi 'entry) | |
| 465 | + | (let ((entry (sxml->result seed frq))) | |
| 466 | + | (set! results (cons entry results))) | |
| 467 | + | #f) | |
| 468 | + | (else | |
| 469 | + | (let* ((seed (reverse seed)) | |
| 470 | + | (element (sxml->element seed elem-gi frq))) | |
| 471 | + | (cons element parent-seed)))))) | |
| 471 | 472 | ||
| 472 | 473 | CHAR-DATA-HANDLER | |
| 473 | 474 | (lambda (string1 string2 seed) | |