jmdict: use ssax and serialize entry count
modules/nani/jmdict/entities.scm
| 1 | 1 | (define-module (nani jmdict entities) | |
| 2 | - | #:export (get-entity)) | |
| 3 | - | ||
| 4 | - | (define entities | |
| 5 | - | `(("MA" . 1) | |
| 6 | - | ("X" . 2) | |
| 7 | - | ("abbr" . 3) | |
| 8 | - | ("adj-i" . 4) | |
| 9 | - | ("adj-ix" . 5) | |
| 10 | - | ("adj-na" . 6) | |
| 11 | - | ("adj-no" . 7) | |
| 12 | - | ("adj-pn" . 8) | |
| 13 | - | ("adj-t" . 9) | |
| 14 | - | ("adj-f" . 10) | |
| 15 | - | ("adv" . 11) | |
| 16 | - | ("adv-to" . 12) | |
| 17 | - | ("arch" . 13) | |
| 18 | - | ("ateji" . 14) | |
| 19 | - | ("aux" . 15) | |
| 20 | - | ("aux-v" . 16) | |
| 21 | - | ("aux-adj" . 17) | |
| 22 | - | ("Buddh" . 18) | |
| 23 | - | ("chem" . 19) | |
| 24 | - | ("chn" . 20) | |
| 25 | - | ("col" . 21) | |
| 26 | - | ("comp" . 22) | |
| 27 | - | ("conj" . 23) | |
| 28 | - | ("cop" . 24) | |
| 29 | - | ("ctr" . 25) | |
| 30 | - | ("derog" . 26) | |
| 31 | - | ("eK" . 27) | |
| 32 | - | ("ek" . 28) | |
| 33 | - | ("exp" . 29) | |
| 34 | - | ("fam" . 30) | |
| 35 | - | ("fem" . 31) | |
| 36 | - | ("food" . 32) | |
| 37 | - | ("geom" . 33) | |
| 38 | - | ("gikun" . 34) | |
| 39 | - | ("hon" . 35) | |
| 40 | - | ("hum" . 36) | |
| 41 | - | ("iK" . 37) | |
| 42 | - | ("id" . 38) | |
| 43 | - | ("ik" . 39) | |
| 44 | - | ("int" . 40) | |
| 45 | - | ("io" . 41) | |
| 46 | - | ("iv" . 42) | |
| 47 | - | ("ling" . 43) | |
| 48 | - | ("m-sl" . 44) | |
| 49 | - | ("male" . 45) | |
| 50 | - | ("male-sl" . 46) | |
| 51 | - | ("math" . 47) | |
| 52 | - | ("mil" . 48) | |
| 53 | - | ("n" . 49) | |
| 54 | - | ("n-adv" . 50) | |
| 55 | - | ("n-suf" . 51) | |
| 56 | - | ("n-pref" . 52) | |
| 57 | - | ("n-t" . 53) | |
| 58 | - | ("num" . 54) | |
| 59 | - | ("oK" . 55) | |
| 60 | - | ("obs" . 56) | |
| 61 | - | ("obsc" . 57) | |
| 62 | - | ("ok" . 58) | |
| 63 | - | ("oik" . 59) | |
| 64 | - | ("on-mim" . 60) | |
| 65 | - | ("pn" . 61) | |
| 66 | - | ("poet" . 62) | |
| 67 | - | ("pol" . 63) | |
| 68 | - | ("pref" . 64) | |
| 69 | - | ("proverb" . 65) | |
| 70 | - | ("prt" . 66) | |
| 71 | - | ("physics" . 67) | |
| 72 | - | ("quote" . 68) | |
| 73 | - | ("rare" . 69) | |
| 74 | - | ("sens" . 70) | |
| 75 | - | ("sl" . 71) | |
| 76 | - | ("suf" . 72) | |
| 77 | - | ("uK" . 73) | |
| 78 | - | ("uk" . 74) | |
| 79 | - | ("unc" . 75) | |
| 80 | - | ("yoji" . 76) | |
| 81 | - | ("v1" . 77) | |
| 82 | - | ("v1-s" . 78) | |
| 83 | - | ("v2a-s" . 79) | |
| 84 | - | ("v4h" . 80) | |
| 85 | - | ("v4r" . 81) | |
| 86 | - | ("v5aru" . 82) | |
| 87 | - | ("v5b" . 83) | |
| 88 | - | ("v5g" . 84) | |
| 89 | - | ("v5k" . 85) | |
| 90 | - | ("v5k-s" . 86) | |
| 91 | - | ("v5m" . 87) | |
| 92 | - | ("v5n" . 88) | |
| 93 | - | ("v5r" . 89) | |
| 94 | - | ("v5r-i" . 90) | |
| 95 | - | ("v5s" . 91) | |
| 96 | - | ("v5t" . 92) | |
| 97 | - | ("v5u" . 93) | |
| 98 | - | ("v5u-s" . 94) | |
| 99 | - | ("v5uru" . 95) | |
| 100 | - | ("vz" . 96) | |
| 101 | - | ("vi" . 97) | |
| 102 | - | ("vk" . 98) | |
| 103 | - | ("vn" . 99) | |
| 104 | - | ("vr" . 100) | |
| 105 | - | ("vs" . 101) | |
| 106 | - | ("vs-c" . 102) | |
| 107 | - | ("vs-s" . 103) | |
| 108 | - | ("vs-i" . 104) | |
| 109 | - | ("kyb" . 105) | |
| 110 | - | ("osb" . 106) | |
| 111 | - | ("ksb" . 107) | |
| 112 | - | ("ktb" . 108) | |
| 113 | - | ("tsb" . 109) | |
| 114 | - | ("thb" . 110) | |
| 115 | - | ("tsug" . 111) | |
| 116 | - | ("kyu" . 112) | |
| 117 | - | ("rkb" . 113) | |
| 118 | - | ("nab" . 114) | |
| 119 | - | ("hob" . 115) | |
| 120 | - | ("vt" . 116) | |
| 121 | - | ("vulg" . 117) | |
| 122 | - | ("adj-kari" . 118) | |
| 123 | - | ("adj-ku" . 119) | |
| 124 | - | ("adj-shiku" . 120) | |
| 125 | - | ("adj-nari" . 121) | |
| 126 | - | ("n-pr" . 122) | |
| 127 | - | ("v-unspec" . 123) | |
| 128 | - | ("v4k" . 124) | |
| 129 | - | ("v4g" . 125) | |
| 130 | - | ("v4s" . 126) | |
| 131 | - | ("v4t" . 127) | |
| 132 | - | ("v4n" . 128) | |
| 133 | - | ("v4b" . 129) | |
| 134 | - | ("v4m" . 130) | |
| 135 | - | ("v2k-k" . 131) | |
| 136 | - | ("v2g-k" . 132) | |
| 137 | - | ("v2t-k" . 133) | |
| 138 | - | ("v2d-k" . 134) | |
| 139 | - | ("v2h-k" . 135) | |
| 140 | - | ("v2b-k" . 136) | |
| 141 | - | ("v2m-k" . 137) | |
| 142 | - | ("v2y-k" . 138) | |
| 143 | - | ("v2r-k" . 139) | |
| 144 | - | ("v2k-s" . 140) | |
| 145 | - | ("v2g-s" . 141) | |
| 146 | - | ("v2s-s" . 142) | |
| 147 | - | ("v2z-s" . 143) | |
| 148 | - | ("v2t-s" . 144) | |
| 149 | - | ("v2d-s" . 145) | |
| 150 | - | ("v2n-s" . 146) | |
| 151 | - | ("v2h-s" . 147) | |
| 152 | - | ("v2b-s" . 148) | |
| 153 | - | ("v2m-s" . 149) | |
| 154 | - | ("v2y-s" . 150) | |
| 155 | - | ("v2r-s" . 151) | |
| 156 | - | ("v2w-s" . 152) | |
| 157 | - | ("archit" . 153) | |
| 158 | - | ("astron" . 154) | |
| 159 | - | ("baseb" . 155) | |
| 160 | - | ("biol" . 156) | |
| 161 | - | ("bot" . 157) | |
| 162 | - | ("bus" . 158) | |
| 163 | - | ("econ" . 159) | |
| 164 | - | ("engr" . 160) | |
| 165 | - | ("finc" . 161) | |
| 166 | - | ("geol" . 162) | |
| 167 | - | ("law" . 163) | |
| 168 | - | ("mahj" . 164) | |
| 169 | - | ("med" . 165) | |
| 170 | - | ("music" . 166) | |
| 171 | - | ("Shinto" . 167) | |
| 172 | - | ("shogi" . 168) | |
| 173 | - | ("sports" . 169) | |
| 174 | - | ("sumo" . 170) | |
| 175 | - | ("zool" . 171) | |
| 176 | - | ("joc" . 172) | |
| 177 | - | ("anat" . 173) | |
| 178 | - | ("Christn" . 174) | |
| 179 | - | ("net-sl" . 175) | |
| 180 | - | ("dated" . 176) | |
| 181 | - | ("hist" . 177) | |
| 182 | - | ("lit" . 178) | |
| 183 | - | ("litf" . 179) | |
| 184 | - | ("surname" . 180) | |
| 185 | - | ("place" . 181) | |
| 186 | - | ("unclass" . 182) | |
| 187 | - | ("company" . 183) | |
| 188 | - | ("product" . 184) | |
| 189 | - | ("work" . 185) | |
| 190 | - | ("person" . 186) | |
| 191 | - | ("given" . 187) | |
| 192 | - | ("station" . 188) | |
| 193 | - | ("organization" . 189) | |
| 194 | - | ("frq500" . 190) | |
| 195 | - | ("frq1000" . 191) | |
| 196 | - | ("frq2000" . 192) | |
| 197 | - | ("frq5000" . 193) | |
| 198 | - | ("frq10000" . 194) | |
| 199 | - | ("frq20000" . 195))) | |
| 200 | - | ||
| 201 | - | (define (get-entity ent) | |
| 202 | - | (let ((val (assoc-ref entities ent))) | |
| 203 | - | (if val val (begin (pk 'unknown-entity-will-cause-error ent) #f)))) | |
| 2 | + | #:use-module (nani result) | |
| 3 | + | #:use-module (nani tags) | |
| 4 | + | #:re-export (get-tag get-points)) |
modules/nani/jmdict/serialize.scm
| 21 | 21 | #:use-module (nani result) | |
| 22 | 22 | #:use-module (nani serialize) | |
| 23 | 23 | #:use-module (nani trie) | |
| 24 | + | #:use-module (ice-9 binary-ports) | |
| 24 | 25 | #:use-module (rnrs bytevectors) | |
| 25 | - | #:export (serialize-jmdict)) | |
| 26 | + | #:export (serialize-jmdict | |
| 27 | + | jmdict-entry-count)) | |
| 26 | 28 | ||
| 27 | 29 | (define (serialize-pointer ptr pos bv) | |
| 28 | 30 | (bytevector-u8-set! bv pos (car ptr)) | |
… | |||
| 164 | 166 | (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size)) | |
| 165 | 167 | (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size | |
| 166 | 168 | reading-huffman-size sense-huffman-size | |
| 167 | - | results-size 0)))) | |
| 169 | + | results-size 4)))) | |
| 168 | 170 | (format #t "Number of nodes in kanjis: ~a~%" | |
| 169 | 171 | (trie-node-size kanji-trie)) | |
| 170 | 172 | (format #t "Number of nodes in readings: ~a~%" | |
… | |||
| 173 | 175 | (trie-node-size sense-trie)) | |
| 174 | 176 | (serialize-list results serialize-result (+ header-size 12 huffman-size) | |
| 175 | 177 | results-bv #:size? #f) | |
| 178 | + | ;; number of entries | |
| 179 | + | (serialize-int (length results) (+ header-size 12 huffman-size results-size) | |
| 180 | + | results-bv) | |
| 176 | 181 | (let* ((results (list->array 1 results)) | |
| 177 | 182 | (pos (bytevector-length results-bv)) | |
| 178 | 183 | (kanji-bvs (serialize-trie kanji-trie results pos)) | |
… | |||
| 200 | 205 | (format #t "senses is ~a bytes long~%" (apply + (map bytevector-length (cdr meaning-bvs)))) | |
| 201 | 206 | (merge-bvs (append (list results-bv) (cdr kanji-bvs) (cdr reading-bvs) | |
| 202 | 207 | (cdr meaning-bvs)))))) | |
| 208 | + | ||
| 209 | + | (define (jmdict-entry-count file) | |
| 210 | + | (call-with-input-file file | |
| 211 | + | (lambda (port) | |
| 212 | + | (let* ((header (utf8->string (get-bytevector-n port 14))) | |
| 213 | + | (pointers (get-bytevector-n port 12)) | |
| 214 | + | (kanji-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
| 215 | + | (seek port (- kanji-pos 4) SEEK_SET) | |
| 216 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) | |
modules/nani/jmdict/xml.scm
| 19 | 19 | (define-module (nani jmdict xml) | |
| 20 | 20 | #:use-module (ice-9 match) | |
| 21 | 21 | #:use-module (ice-9 rdelim) | |
| 22 | - | #:use-module (sxml fold) | |
| 22 | + | #:use-module (sxml ssax) | |
| 23 | 23 | #:use-module (nani frequency) | |
| 24 | 24 | #:use-module (sxml simple) | |
| 25 | 25 | #:use-module (nani result) | |
| 26 | 26 | #:use-module (nani jmdict entities) | |
| 27 | - | #:export (load-dic sxml->results)) | |
| 27 | + | #:export (load-dic xml->results)) | |
| 28 | 28 | ||
| 29 | 29 | (define (load-dic file) | |
| 30 | 30 | (xml->sxml (call-with-input-file file read-string))) | |
… | |||
| 38 | 38 | (('reading r) (update-reading reading #:readings (cons r (reading-readings reading)))) | |
| 39 | 39 | (('info r) (update-reading reading #:info (cons r (reading-info reading)))) | |
| 40 | 40 | (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading)))) | |
| 41 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 41 | 42 | ((? string? _) reading)) | |
| 42 | 43 | (cdr lst))))) | |
| 43 | 44 | ||
… | |||
| 48 | 49 | (loop | |
| 49 | 50 | (match (car lst) | |
| 50 | 51 | (('content c) (update-source source #:content (cons c (source-content source)))) | |
| 51 | - | (('wasei _) (update-source source #:wasei? #t)) | |
| 52 | - | (('type t) (update-source source #:type t)) | |
| 53 | - | (('lang l) (update-source source #:lang l)) | |
| 52 | + | (('ls_wasei _) (update-source source #:wasei? #t)) | |
| 53 | + | (('ls_type t) (update-source source #:type t)) | |
| 54 | + | ((('xml . 'lang) l) (update-source source #:lang l)) | |
| 55 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 54 | 56 | ((? list? l) (loop source l)) | |
| 55 | 57 | ((? string? _) source)) | |
| 56 | 58 | (cdr lst))))) | |
… | |||
| 68 | 70 | (('tag (? integer? r)) (update-sense sense #:tags (cons r (sense-tags sense)))) | |
| 69 | 71 | (('gloss (? string? r)) (update-sense sense #:glosses (cons r (sense-glosses sense)))) | |
| 70 | 72 | (('lang (? string? l)) (update-sense sense #:language l)) | |
| 73 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 71 | 74 | ((? list? l) (loop sense l)) | |
| 72 | 75 | ((? string? _) sense)) | |
| 73 | 76 | (cdr lst))))) | |
… | |||
| 94 | 97 | (sense1 (car (result-senses result))) | |
| 95 | 98 | (sense1 (update-sense sense1 #:tags (cons entity (sense-tags sense1)))) | |
| 96 | 99 | (senses (cons sense1 (cdr (result-senses result))))) | |
| 97 | - | (update-result result #:points (get-points (update-result result #:senses senses)))))) | |
| 100 | + | (update-result result | |
| 101 | + | #:points (get-points (update-result result #:senses senses)) | |
| 102 | + | #:senses senses)))) | |
| 98 | 103 | ||
| 99 | - | (define (get-points result) | |
| 100 | - | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
| 101 | - | (apply + (map (lambda (tag) | |
| 102 | - | (cond | |
| 103 | - | ;; frequency | |
| 104 | - | ((eq? tag (get-entity "frq500")) | |
| 105 | - | 32) | |
| 106 | - | ((eq? tag (get-entity "frq1000")) | |
| 107 | - | 16) | |
| 108 | - | ((eq? tag (get-entity "frq2000")) | |
| 109 | - | 8) | |
| 110 | - | ((eq? tag (get-entity "frq5000")) | |
| 111 | - | 4) | |
| 112 | - | ((eq? tag (get-entity "frq10000")) | |
| 113 | - | 2) | |
| 114 | - | ((eq? tag (get-entity "frq20000")) | |
| 115 | - | 1) | |
| 116 | - | ;; archaic term | |
| 117 | - | ((eq? tag (get-entity "arch")) | |
| 118 | - | -3) | |
| 119 | - | ;; children language | |
| 120 | - | ((eq? tag (get-entity "chn")) | |
| 121 | - | -1) | |
| 122 | - | ;; colloquialism | |
| 123 | - | ((eq? tag (get-entity "col")) | |
| 124 | - | 5) | |
| 125 | - | ;; expression | |
| 126 | - | ((eq? tag (get-entity "exp")) | |
| 127 | - | 1) | |
| 128 | - | ;; sonkeigo (honorific or respectful) | |
| 129 | - | ((eq? tag (get-entity "hon")) | |
| 130 | - | -1) | |
| 131 | - | ;; kenjougo (humble) | |
| 132 | - | ((eq? tag (get-entity "hum")) | |
| 133 | - | -1) | |
| 134 | - | ;; teineigo (polite) | |
| 135 | - | ((eq? tag (get-entity "pol")) | |
| 136 | - | -1) | |
| 137 | - | ;; irregular kanji | |
| 138 | - | ((eq? tag (get-entity "iK")) | |
| 139 | - | -2) | |
| 140 | - | ;; idiomatic expression | |
| 141 | - | ((eq? tag (get-entity "id")) | |
| 142 | - | 3) | |
| 143 | - | ;; slang | |
| 144 | - | ((member tag (list | |
| 145 | - | (get-entity "sl") | |
| 146 | - | (get-entity "m-sl") | |
| 147 | - | (get-entity "male-sl"))) | |
| 148 | - | -4) | |
| 149 | - | ;; outdated kanji | |
| 150 | - | ((eq? tag (get-entity "oK")) | |
| 151 | - | -2) | |
| 152 | - | ;; obsolete term | |
| 153 | - | ((eq? tag (get-entity "obs")) | |
| 154 | - | -5) | |
| 155 | - | ;; obscure term | |
| 156 | - | ((eq? tag (get-entity "obsc")) | |
| 157 | - | -10) | |
| 158 | - | ;; onomatopea | |
| 159 | - | ((eq? tag (get-entity "on-mim")) | |
| 160 | - | 1) | |
| 161 | - | ;; poetical term | |
| 162 | - | ((eq? tag (get-entity "poet")) | |
| 163 | - | -1) | |
| 164 | - | ((eq? tag (get-entity "proverb")) | |
| 165 | - | 1) | |
| 166 | - | ((eq? tag (get-entity "rare")) | |
| 167 | - | -3) | |
| 168 | - | ;; sensitive | |
| 169 | - | ((eq? tag (get-entity "sens")) | |
| 170 | - | -2) | |
| 171 | - | ;; dialect | |
| 172 | - | ((member tag (list | |
| 173 | - | (get-entity "kyb") | |
| 174 | - | (get-entity "osb") | |
| 175 | - | (get-entity "ksb") | |
| 176 | - | (get-entity "ktb") | |
| 177 | - | (get-entity "tsb") | |
| 178 | - | (get-entity "thb") | |
| 179 | - | (get-entity "tsug") | |
| 180 | - | (get-entity "kyu") | |
| 181 | - | (get-entity "rkb") | |
| 182 | - | (get-entity "nab") | |
| 183 | - | (get-entity "hob"))) | |
| 184 | - | -1) | |
| 185 | - | (else 0))) | |
| 186 | - | tags)))) | |
| 187 | - | ||
| 188 | - | (define (sxml->results sxml frq) | |
| 189 | - | (sort | |
| 190 | - | (foldt | |
| 191 | - | (lambda (xml) | |
| 192 | - | (match xml | |
| 193 | - | (('ent_seq _) "") | |
| 194 | - | (('ke_pri _) "") | |
| 195 | - | (('re_pri _) "") | |
| 196 | - | (('pri _) "") | |
| 197 | - | (('keb kanji) `(kanji ,kanji)) | |
| 198 | - | (('ke_inf _) "") | |
| 199 | - | (('k_ele lst ...) (car (filter list? lst))) | |
| 200 | - | (('reb reading) `(reading ,reading)) | |
| 201 | - | (('re_nokanji _ ...) "") | |
| 202 | - | (('re_restr r) `(limit ,r)) | |
| 203 | - | (('re_inf r) `(info ,r)) | |
| 204 | - | (('r_ele lst ...) (sxml->reading lst)) | |
| 205 | - | (('ant pos) `(ref ,pos)) | |
| 206 | - | (('dial pos) `(tag ,(get-entity pos))) | |
| 207 | - | (('field pos) `(tag ,(get-entity pos))) | |
| 208 | - | (('misc pos) `(tag ,(get-entity pos))) | |
| 209 | - | (('pos pos) `(tag ,(get-entity pos))) | |
| 210 | - | (('xref pos) `(ref ,pos)) | |
| 211 | - | (('g_type _ ...) "") | |
| 212 | - | (('gloss (? string? g)) xml) | |
| 213 | - | (('gloss (? list? g)) g) | |
| 214 | - | (('gloss attr g) (cons `(gloss ,g) attr)) | |
| 215 | - | (('stagk k) `(limit ,k)) | |
| 216 | - | (('stagr r) `(limit ,r)) | |
| 217 | - | (('s_inf r) `(info ,r)) | |
| 218 | - | (('lsource lst ...) (sxml->source lst)) | |
| 219 | - | (('sense lst ...) (sxml->sense lst)) | |
| 220 | - | (('entry lst ...) (sxml->result lst frq)) | |
| 221 | - | (('JMdict lst ...) lst) | |
| 222 | - | (('xml:lang l) `(lang ,l)) | |
| 223 | - | (('ls_wasei _) '(wasei #t)) | |
| 224 | - | (('ls_type t) `(type ,t)) | |
| 225 | - | (('@ lst ...) lst) | |
| 226 | - | (('*TOP* _ l) (filter result? l)) | |
| 227 | - | (('*PI* _ ...) #f))) | |
| 228 | - | (lambda (a) a) | |
| 229 | - | sxml) | |
| 230 | - | ;; reverse order: bigger score first | |
| 231 | - | (lambda (a b) (> (result-points a) (result-points b))))) | |
| 104 | + | (define (sxml->element lst elem frq) | |
| 105 | + | (match elem | |
| 106 | + | ('ent_seq "") | |
| 107 | + | ('ke_pri "") | |
| 108 | + | ('re_nokanji "") | |
| 109 | + | ('re_pri "") | |
| 110 | + | ('ke_inf "") | |
| 111 | + | ('misc (if (and (= (length lst) 1) (string? (car lst))) | |
| 112 | + | `(tag ,(get-tag (car lst))) | |
| 113 | + | (throw 'invalid-misc lst))) | |
| 114 | + | ('re_restr (if (and (= (length lst) 1) (string? (car lst))) | |
| 115 | + | `(limit ,(car lst)) | |
| 116 | + | (throw 'invalid-re_restr lst))) | |
| 117 | + | ('keb (if (and (= (length lst) 1) (string? (car lst))) | |
| 118 | + | `(kanji ,(car lst)) | |
| 119 | + | (throw 'invalid-keb lst))) | |
| 120 | + | ('s_inf (if (and (= (length lst) 1) (string? (car lst))) | |
| 121 | + | `(info ,(car lst)) | |
| 122 | + | (throw 'invalid-s_inf lst))) | |
| 123 | + | ('dial (if (and (= (length lst) 1) (string? (car lst))) | |
| 124 | + | `(tag ,(get-tag (car lst))) | |
| 125 | + | (throw 'invalid-dial lst))) | |
| 126 | + | ('re_inf (if (and (= (length lst) 1) (string? (car lst))) | |
| 127 | + | `(info ,(car lst)) | |
| 128 | + | (throw 'invalid-re_inf lst))) | |
| 129 | + | ('stagk (if (and (= (length lst) 1) (string? (car lst))) | |
| 130 | + | `(limit ,(car lst)) | |
| 131 | + | (throw 'invalid-stagk lst))) | |
| 132 | + | ('stagr (if (and (= (length lst) 1) (string? (car lst))) | |
| 133 | + | `(limit ,(car lst)) | |
| 134 | + | (throw 'invalid-stagr lst))) | |
| 135 | + | ('field (if (and (= (length lst) 1) (string? (car lst))) | |
| 136 | + | `(tag ,(get-tag (car lst))) | |
| 137 | + | (throw 'invalid-field lst))) | |
| 138 | + | ('ant (if (and (= (length lst) 1) (string? (car lst))) | |
| 139 | + | `(ref ,(car lst)) | |
| 140 | + | (throw 'invalid-ant lst))) | |
| 141 | + | ('reb `(reading ,lst)) | |
| 142 | + | ('r_ele (sxml->reading lst)) | |
| 143 | + | ('k_ele (car (filter list? lst))) | |
| 144 | + | ('pos (if (and (= (length lst) 1) (string? (car lst))) | |
| 145 | + | `(tag ,(get-tag (car lst))) | |
| 146 | + | (throw 'invalid-pos lst))) | |
| 147 | + | ('xref (if (and (= (length lst) 1) (string? (car lst))) | |
| 148 | + | `(ref ,(car lst)) | |
| 149 | + | (throw 'invalid-xref lst))) | |
| 150 | + | ('gloss (append | |
| 151 | + | (filter list? lst) | |
| 152 | + | (map (lambda (v) `(gloss ,v)) (filter string? lst)))) | |
| 153 | + | ('lsource (sxml->source lst)) | |
| 154 | + | ('sense (sxml->sense lst)) | |
| 155 | + | ('entry (sxml->result lst frq)))) | |
| 156 | + | ||
| 157 | + | (define (create-parser frq) | |
| 158 | + | (ssax:make-parser | |
| 159 | + | NEW-LEVEL-SEED | |
| 160 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 161 | + | (map | |
| 162 | + | (match-lambda | |
| 163 | + | ((k . v) (list k v))) | |
| 164 | + | (filter | |
| 165 | + | (match-lambda | |
| 166 | + | ((k . v) (not (member k '(g_type))))) | |
| 167 | + | attributes))) | |
| 168 | + | ||
| 169 | + | FINISH-ELEMENT | |
| 170 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 171 | + | (if (equal? elem-gi 'JMdict) | |
| 172 | + | seed | |
| 173 | + | (let* ((seed (reverse seed)) | |
| 174 | + | (element (sxml->element seed elem-gi frq))) | |
| 175 | + | (cons element parent-seed)))) | |
| 176 | + | ||
| 177 | + | CHAR-DATA-HANDLER | |
| 178 | + | (lambda (string1 string2 seed) | |
| 179 | + | (cons (string-append string1 string2) seed)))) | |
| 180 | + | ||
| 181 | + | (define (xml->results port frq) | |
| 182 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
| 183 | + | (sort results (lambda (a b) (> (result-points a) (result-points b)))))) | |
modules/nani/tags.scm unknown status 1
| 1 | + | (define-module (nani tags) | |
| 2 | + | #:use-module (nani result) | |
| 3 | + | #:export (get-tag get-points)) | |
| 4 | + | ||
| 5 | + | (define tags | |
| 6 | + | '("MA" "X" "abbr" "adj-i" "adj-ix" "adj-na" "adj-no" "adj-pn" "adj-t" "adj-f" | |
| 7 | + | "adv" "adv-to" "arch" "ateji" "aux" "aux-v" "aux-adj" "Buddh" "chem" "chn" | |
| 8 | + | "col" "comp" "conj" "cop" "ctr" "derog" "eK" "ek" "exp" "fam" "fem" "food" | |
| 9 | + | "geom" "gikun" "hon" "hum" "iK" "id" "ik" "int" "io" "iv" "ling" "m-sl" | |
| 10 | + | "male" "male-sl" "math" "meteor" "mil" "n" "n-adv" "n-suf" "n-pref" "n-t" | |
| 11 | + | "num" "oK" "obs" "obsc" "ok" "oik" "on-mim" "pn" "poet" "pol" "pref" "proverb" | |
| 12 | + | "prt" "physics" "quote" "rare" "sens" "sl" "suf" "uK" "uk" "unc" "yoji" | |
| 13 | + | "v1" "v1-s" "v2a-s" "v4h" "v4r" "v5aru" "v5b" "v5g" "v5k" "v5k-s" "v5m" | |
| 14 | + | "v5n" "v5r" "v5r-i" "v5s" "v5t" "v5u" "v5u-s" "v5uru" "vz" "vi" "vk" "vn" | |
| 15 | + | "vr" "vs" "vs-c" "vs-s" "vs-i" "kyb" "osb" "ksb" "ktb" "tsb" "thb" "tsug" | |
| 16 | + | "kyu" "rkb" "nab" "hob" "vt" "vulg" "adj-kari" "adj-ku" "adj-shiku" | |
| 17 | + | "adj-nari" "n-pr" "v-unspec" "v4k" "v4g" "v4s" "v4t" "v4n" "v4b" "v4m" | |
| 18 | + | "v2k-k" "v2g-k" "v2t-k" "v2d-k" "v2h-k" "v2b-k" "v2m-k" "v2y-k" "v2r-k" | |
| 19 | + | "v2k-s" "v2g-s" "v2s-s" "v2z-s" "v2t-s" "v2d-s" "v2n-s" "v2h-s" "v2b-s" | |
| 20 | + | "v2m-s" "v2y-s" "v2r-s" "v2w-s" "archit" "astron" "baseb" "biol" "bot" | |
| 21 | + | "bus" "econ" "engr" "finc" "geol" "law" "mahj" "med" "music" "Shinto" | |
| 22 | + | "shogi" "sports" "sumo" "zool" "joc" "anat" "Christn" "net-sl" "dated" | |
| 23 | + | "hist" "litf" "surname" "place" "unclass" "company" "product" "work" | |
| 24 | + | "person" "given" "station" "organization" | |
| 25 | + | ||
| 26 | + | ;; wadoku special | |
| 27 | + | "young" "thief" "secret" "baby" "stud" "sail" "fashion" "archeo" | |
| 28 | + | "rel" "color" "psy" "print" "aero" "politics" "anth" "biblio" | |
| 29 | + | "game" "agri" "alco" "demo" "train" "philo" "ethno" "photo" "trans" | |
| 30 | + | "telecom" "build" "sci" "school" "art" "hobby" | |
| 31 | + | ||
| 32 | + | "season-spring" "season-summer" "season-winter" "season-autumn" | |
| 33 | + | ||
| 34 | + | "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000")) | |
| 35 | + | ||
| 36 | + | (define (get-tag tag) | |
| 37 | + | (let ((val (member tag tags))) | |
| 38 | + | (if val (- (length val) 1) (throw 'unknown-tag tag)))) | |
| 39 | + | ||
| 40 | + | (define (get-points result) | |
| 41 | + | (define (tag-point tag) | |
| 42 | + | (cond | |
| 43 | + | ;; frequency | |
| 44 | + | ((eq? tag (get-tag "frq500")) | |
| 45 | + | 32) | |
| 46 | + | ((eq? tag (get-tag "frq1000")) | |
| 47 | + | 16) | |
| 48 | + | ((eq? tag (get-tag "frq2000")) | |
| 49 | + | 8) | |
| 50 | + | ((eq? tag (get-tag "frq5000")) | |
| 51 | + | 4) | |
| 52 | + | ((eq? tag (get-tag "frq10000")) | |
| 53 | + | 2) | |
| 54 | + | ((eq? tag (get-tag "frq20000")) | |
| 55 | + | 1) | |
| 56 | + | ;; archaic term | |
| 57 | + | ((eq? tag (get-tag "arch")) | |
| 58 | + | -3) | |
| 59 | + | ;; children language | |
| 60 | + | ((eq? tag (get-tag "chn")) | |
| 61 | + | -1) | |
| 62 | + | ;; colloquialism | |
| 63 | + | ((eq? tag (get-tag "col")) | |
| 64 | + | 5) | |
| 65 | + | ;; expression | |
| 66 | + | ((eq? tag (get-tag "exp")) | |
| 67 | + | 1) | |
| 68 | + | ;; sonkeigo (honorific or respectful) | |
| 69 | + | ((eq? tag (get-tag "hon")) | |
| 70 | + | -1) | |
| 71 | + | ;; kenjougo (humble) | |
| 72 | + | ((eq? tag (get-tag "hum")) | |
| 73 | + | -1) | |
| 74 | + | ;; teineigo (polite) | |
| 75 | + | ((eq? tag (get-tag "pol")) | |
| 76 | + | -1) | |
| 77 | + | ;; irregular kanji | |
| 78 | + | ((eq? tag (get-tag "iK")) | |
| 79 | + | -2) | |
| 80 | + | ;; idiomatic expression | |
| 81 | + | ((eq? tag (get-tag "id")) | |
| 82 | + | 3) | |
| 83 | + | ;; slang | |
| 84 | + | ((member tag (list | |
| 85 | + | (get-tag "sl") | |
| 86 | + | (get-tag "m-sl") | |
| 87 | + | (get-tag "male-sl"))) | |
| 88 | + | -4) | |
| 89 | + | ;; outdated kanji | |
| 90 | + | ((eq? tag (get-tag "oK")) | |
| 91 | + | -2) | |
| 92 | + | ;; obsolete term | |
| 93 | + | ((eq? tag (get-tag "obs")) | |
| 94 | + | -5) | |
| 95 | + | ;; obscure term | |
| 96 | + | ((eq? tag (get-tag "obsc")) | |
| 97 | + | -10) | |
| 98 | + | ;; onomatopea | |
| 99 | + | ((eq? tag (get-tag "on-mim")) | |
| 100 | + | 1) | |
| 101 | + | ;; poetical term | |
| 102 | + | ((eq? tag (get-tag "poet")) | |
| 103 | + | -1) | |
| 104 | + | ((eq? tag (get-tag "proverb")) | |
| 105 | + | 1) | |
| 106 | + | ((eq? tag (get-tag "rare")) | |
| 107 | + | -3) | |
| 108 | + | ;; sensitive | |
| 109 | + | ((eq? tag (get-tag "sens")) | |
| 110 | + | -2) | |
| 111 | + | ;; dialect | |
| 112 | + | ((member tag (list | |
| 113 | + | (get-tag "kyb") | |
| 114 | + | (get-tag "osb") | |
| 115 | + | (get-tag "ksb") | |
| 116 | + | (get-tag "ktb") | |
| 117 | + | (get-tag "tsb") | |
| 118 | + | (get-tag "thb") | |
| 119 | + | (get-tag "tsug") | |
| 120 | + | (get-tag "kyu") | |
| 121 | + | (get-tag "rkb") | |
| 122 | + | (get-tag "nab") | |
| 123 | + | (get-tag "hob"))) | |
| 124 | + | -1) | |
| 125 | + | (else 0))) | |
| 126 | + | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
| 127 | + | (apply + (map tag-point tags)))) |