jmdict: use ssax and serialize entry count
modules/nani/jmdict/entities.scm
1 | 1 | (define-module (nani jmdict entities) | |
2 | - | #:export (get-entity)) | |
3 | - | ||
4 | - | (define entities | |
5 | - | `(("MA" . 1) | |
6 | - | ("X" . 2) | |
7 | - | ("abbr" . 3) | |
8 | - | ("adj-i" . 4) | |
9 | - | ("adj-ix" . 5) | |
10 | - | ("adj-na" . 6) | |
11 | - | ("adj-no" . 7) | |
12 | - | ("adj-pn" . 8) | |
13 | - | ("adj-t" . 9) | |
14 | - | ("adj-f" . 10) | |
15 | - | ("adv" . 11) | |
16 | - | ("adv-to" . 12) | |
17 | - | ("arch" . 13) | |
18 | - | ("ateji" . 14) | |
19 | - | ("aux" . 15) | |
20 | - | ("aux-v" . 16) | |
21 | - | ("aux-adj" . 17) | |
22 | - | ("Buddh" . 18) | |
23 | - | ("chem" . 19) | |
24 | - | ("chn" . 20) | |
25 | - | ("col" . 21) | |
26 | - | ("comp" . 22) | |
27 | - | ("conj" . 23) | |
28 | - | ("cop" . 24) | |
29 | - | ("ctr" . 25) | |
30 | - | ("derog" . 26) | |
31 | - | ("eK" . 27) | |
32 | - | ("ek" . 28) | |
33 | - | ("exp" . 29) | |
34 | - | ("fam" . 30) | |
35 | - | ("fem" . 31) | |
36 | - | ("food" . 32) | |
37 | - | ("geom" . 33) | |
38 | - | ("gikun" . 34) | |
39 | - | ("hon" . 35) | |
40 | - | ("hum" . 36) | |
41 | - | ("iK" . 37) | |
42 | - | ("id" . 38) | |
43 | - | ("ik" . 39) | |
44 | - | ("int" . 40) | |
45 | - | ("io" . 41) | |
46 | - | ("iv" . 42) | |
47 | - | ("ling" . 43) | |
48 | - | ("m-sl" . 44) | |
49 | - | ("male" . 45) | |
50 | - | ("male-sl" . 46) | |
51 | - | ("math" . 47) | |
52 | - | ("mil" . 48) | |
53 | - | ("n" . 49) | |
54 | - | ("n-adv" . 50) | |
55 | - | ("n-suf" . 51) | |
56 | - | ("n-pref" . 52) | |
57 | - | ("n-t" . 53) | |
58 | - | ("num" . 54) | |
59 | - | ("oK" . 55) | |
60 | - | ("obs" . 56) | |
61 | - | ("obsc" . 57) | |
62 | - | ("ok" . 58) | |
63 | - | ("oik" . 59) | |
64 | - | ("on-mim" . 60) | |
65 | - | ("pn" . 61) | |
66 | - | ("poet" . 62) | |
67 | - | ("pol" . 63) | |
68 | - | ("pref" . 64) | |
69 | - | ("proverb" . 65) | |
70 | - | ("prt" . 66) | |
71 | - | ("physics" . 67) | |
72 | - | ("quote" . 68) | |
73 | - | ("rare" . 69) | |
74 | - | ("sens" . 70) | |
75 | - | ("sl" . 71) | |
76 | - | ("suf" . 72) | |
77 | - | ("uK" . 73) | |
78 | - | ("uk" . 74) | |
79 | - | ("unc" . 75) | |
80 | - | ("yoji" . 76) | |
81 | - | ("v1" . 77) | |
82 | - | ("v1-s" . 78) | |
83 | - | ("v2a-s" . 79) | |
84 | - | ("v4h" . 80) | |
85 | - | ("v4r" . 81) | |
86 | - | ("v5aru" . 82) | |
87 | - | ("v5b" . 83) | |
88 | - | ("v5g" . 84) | |
89 | - | ("v5k" . 85) | |
90 | - | ("v5k-s" . 86) | |
91 | - | ("v5m" . 87) | |
92 | - | ("v5n" . 88) | |
93 | - | ("v5r" . 89) | |
94 | - | ("v5r-i" . 90) | |
95 | - | ("v5s" . 91) | |
96 | - | ("v5t" . 92) | |
97 | - | ("v5u" . 93) | |
98 | - | ("v5u-s" . 94) | |
99 | - | ("v5uru" . 95) | |
100 | - | ("vz" . 96) | |
101 | - | ("vi" . 97) | |
102 | - | ("vk" . 98) | |
103 | - | ("vn" . 99) | |
104 | - | ("vr" . 100) | |
105 | - | ("vs" . 101) | |
106 | - | ("vs-c" . 102) | |
107 | - | ("vs-s" . 103) | |
108 | - | ("vs-i" . 104) | |
109 | - | ("kyb" . 105) | |
110 | - | ("osb" . 106) | |
111 | - | ("ksb" . 107) | |
112 | - | ("ktb" . 108) | |
113 | - | ("tsb" . 109) | |
114 | - | ("thb" . 110) | |
115 | - | ("tsug" . 111) | |
116 | - | ("kyu" . 112) | |
117 | - | ("rkb" . 113) | |
118 | - | ("nab" . 114) | |
119 | - | ("hob" . 115) | |
120 | - | ("vt" . 116) | |
121 | - | ("vulg" . 117) | |
122 | - | ("adj-kari" . 118) | |
123 | - | ("adj-ku" . 119) | |
124 | - | ("adj-shiku" . 120) | |
125 | - | ("adj-nari" . 121) | |
126 | - | ("n-pr" . 122) | |
127 | - | ("v-unspec" . 123) | |
128 | - | ("v4k" . 124) | |
129 | - | ("v4g" . 125) | |
130 | - | ("v4s" . 126) | |
131 | - | ("v4t" . 127) | |
132 | - | ("v4n" . 128) | |
133 | - | ("v4b" . 129) | |
134 | - | ("v4m" . 130) | |
135 | - | ("v2k-k" . 131) | |
136 | - | ("v2g-k" . 132) | |
137 | - | ("v2t-k" . 133) | |
138 | - | ("v2d-k" . 134) | |
139 | - | ("v2h-k" . 135) | |
140 | - | ("v2b-k" . 136) | |
141 | - | ("v2m-k" . 137) | |
142 | - | ("v2y-k" . 138) | |
143 | - | ("v2r-k" . 139) | |
144 | - | ("v2k-s" . 140) | |
145 | - | ("v2g-s" . 141) | |
146 | - | ("v2s-s" . 142) | |
147 | - | ("v2z-s" . 143) | |
148 | - | ("v2t-s" . 144) | |
149 | - | ("v2d-s" . 145) | |
150 | - | ("v2n-s" . 146) | |
151 | - | ("v2h-s" . 147) | |
152 | - | ("v2b-s" . 148) | |
153 | - | ("v2m-s" . 149) | |
154 | - | ("v2y-s" . 150) | |
155 | - | ("v2r-s" . 151) | |
156 | - | ("v2w-s" . 152) | |
157 | - | ("archit" . 153) | |
158 | - | ("astron" . 154) | |
159 | - | ("baseb" . 155) | |
160 | - | ("biol" . 156) | |
161 | - | ("bot" . 157) | |
162 | - | ("bus" . 158) | |
163 | - | ("econ" . 159) | |
164 | - | ("engr" . 160) | |
165 | - | ("finc" . 161) | |
166 | - | ("geol" . 162) | |
167 | - | ("law" . 163) | |
168 | - | ("mahj" . 164) | |
169 | - | ("med" . 165) | |
170 | - | ("music" . 166) | |
171 | - | ("Shinto" . 167) | |
172 | - | ("shogi" . 168) | |
173 | - | ("sports" . 169) | |
174 | - | ("sumo" . 170) | |
175 | - | ("zool" . 171) | |
176 | - | ("joc" . 172) | |
177 | - | ("anat" . 173) | |
178 | - | ("Christn" . 174) | |
179 | - | ("net-sl" . 175) | |
180 | - | ("dated" . 176) | |
181 | - | ("hist" . 177) | |
182 | - | ("lit" . 178) | |
183 | - | ("litf" . 179) | |
184 | - | ("surname" . 180) | |
185 | - | ("place" . 181) | |
186 | - | ("unclass" . 182) | |
187 | - | ("company" . 183) | |
188 | - | ("product" . 184) | |
189 | - | ("work" . 185) | |
190 | - | ("person" . 186) | |
191 | - | ("given" . 187) | |
192 | - | ("station" . 188) | |
193 | - | ("organization" . 189) | |
194 | - | ("frq500" . 190) | |
195 | - | ("frq1000" . 191) | |
196 | - | ("frq2000" . 192) | |
197 | - | ("frq5000" . 193) | |
198 | - | ("frq10000" . 194) | |
199 | - | ("frq20000" . 195))) | |
200 | - | ||
201 | - | (define (get-entity ent) | |
202 | - | (let ((val (assoc-ref entities ent))) | |
203 | - | (if val val (begin (pk 'unknown-entity-will-cause-error ent) #f)))) | |
2 | + | #:use-module (nani result) | |
3 | + | #:use-module (nani tags) | |
4 | + | #:re-export (get-tag get-points)) |
modules/nani/jmdict/serialize.scm
21 | 21 | #:use-module (nani result) | |
22 | 22 | #:use-module (nani serialize) | |
23 | 23 | #:use-module (nani trie) | |
24 | + | #:use-module (ice-9 binary-ports) | |
24 | 25 | #:use-module (rnrs bytevectors) | |
25 | - | #:export (serialize-jmdict)) | |
26 | + | #:export (serialize-jmdict | |
27 | + | jmdict-entry-count)) | |
26 | 28 | ||
27 | 29 | (define (serialize-pointer ptr pos bv) | |
28 | 30 | (bytevector-u8-set! bv pos (car ptr)) | |
… | |||
164 | 166 | (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size)) | |
165 | 167 | (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size | |
166 | 168 | reading-huffman-size sense-huffman-size | |
167 | - | results-size 0)))) | |
169 | + | results-size 4)))) | |
168 | 170 | (format #t "Number of nodes in kanjis: ~a~%" | |
169 | 171 | (trie-node-size kanji-trie)) | |
170 | 172 | (format #t "Number of nodes in readings: ~a~%" | |
… | |||
173 | 175 | (trie-node-size sense-trie)) | |
174 | 176 | (serialize-list results serialize-result (+ header-size 12 huffman-size) | |
175 | 177 | results-bv #:size? #f) | |
178 | + | ;; number of entries | |
179 | + | (serialize-int (length results) (+ header-size 12 huffman-size results-size) | |
180 | + | results-bv) | |
176 | 181 | (let* ((results (list->array 1 results)) | |
177 | 182 | (pos (bytevector-length results-bv)) | |
178 | 183 | (kanji-bvs (serialize-trie kanji-trie results pos)) | |
… | |||
200 | 205 | (format #t "senses is ~a bytes long~%" (apply + (map bytevector-length (cdr meaning-bvs)))) | |
201 | 206 | (merge-bvs (append (list results-bv) (cdr kanji-bvs) (cdr reading-bvs) | |
202 | 207 | (cdr meaning-bvs)))))) | |
208 | + | ||
209 | + | (define (jmdict-entry-count file) | |
210 | + | (call-with-input-file file | |
211 | + | (lambda (port) | |
212 | + | (let* ((header (utf8->string (get-bytevector-n port 14))) | |
213 | + | (pointers (get-bytevector-n port 12)) | |
214 | + | (kanji-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
215 | + | (seek port (- kanji-pos 4) SEEK_SET) | |
216 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) |
modules/nani/jmdict/xml.scm
19 | 19 | (define-module (nani jmdict xml) | |
20 | 20 | #:use-module (ice-9 match) | |
21 | 21 | #:use-module (ice-9 rdelim) | |
22 | - | #:use-module (sxml fold) | |
22 | + | #:use-module (sxml ssax) | |
23 | 23 | #:use-module (nani frequency) | |
24 | 24 | #:use-module (sxml simple) | |
25 | 25 | #:use-module (nani result) | |
26 | 26 | #:use-module (nani jmdict entities) | |
27 | - | #:export (load-dic sxml->results)) | |
27 | + | #:export (load-dic xml->results)) | |
28 | 28 | ||
29 | 29 | (define (load-dic file) | |
30 | 30 | (xml->sxml (call-with-input-file file read-string))) | |
… | |||
38 | 38 | (('reading r) (update-reading reading #:readings (cons r (reading-readings reading)))) | |
39 | 39 | (('info r) (update-reading reading #:info (cons r (reading-info reading)))) | |
40 | 40 | (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading)))) | |
41 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
41 | 42 | ((? string? _) reading)) | |
42 | 43 | (cdr lst))))) | |
43 | 44 | ||
… | |||
48 | 49 | (loop | |
49 | 50 | (match (car lst) | |
50 | 51 | (('content c) (update-source source #:content (cons c (source-content source)))) | |
51 | - | (('wasei _) (update-source source #:wasei? #t)) | |
52 | - | (('type t) (update-source source #:type t)) | |
53 | - | (('lang l) (update-source source #:lang l)) | |
52 | + | (('ls_wasei _) (update-source source #:wasei? #t)) | |
53 | + | (('ls_type t) (update-source source #:type t)) | |
54 | + | ((('xml . 'lang) l) (update-source source #:lang l)) | |
55 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
54 | 56 | ((? list? l) (loop source l)) | |
55 | 57 | ((? string? _) source)) | |
56 | 58 | (cdr lst))))) | |
… | |||
68 | 70 | (('tag (? integer? r)) (update-sense sense #:tags (cons r (sense-tags sense)))) | |
69 | 71 | (('gloss (? string? r)) (update-sense sense #:glosses (cons r (sense-glosses sense)))) | |
70 | 72 | (('lang (? string? l)) (update-sense sense #:language l)) | |
73 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
71 | 74 | ((? list? l) (loop sense l)) | |
72 | 75 | ((? string? _) sense)) | |
73 | 76 | (cdr lst))))) | |
… | |||
94 | 97 | (sense1 (car (result-senses result))) | |
95 | 98 | (sense1 (update-sense sense1 #:tags (cons entity (sense-tags sense1)))) | |
96 | 99 | (senses (cons sense1 (cdr (result-senses result))))) | |
97 | - | (update-result result #:points (get-points (update-result result #:senses senses)))))) | |
100 | + | (update-result result | |
101 | + | #:points (get-points (update-result result #:senses senses)) | |
102 | + | #:senses senses)))) | |
98 | 103 | ||
99 | - | (define (get-points result) | |
100 | - | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
101 | - | (apply + (map (lambda (tag) | |
102 | - | (cond | |
103 | - | ;; frequency | |
104 | - | ((eq? tag (get-entity "frq500")) | |
105 | - | 32) | |
106 | - | ((eq? tag (get-entity "frq1000")) | |
107 | - | 16) | |
108 | - | ((eq? tag (get-entity "frq2000")) | |
109 | - | 8) | |
110 | - | ((eq? tag (get-entity "frq5000")) | |
111 | - | 4) | |
112 | - | ((eq? tag (get-entity "frq10000")) | |
113 | - | 2) | |
114 | - | ((eq? tag (get-entity "frq20000")) | |
115 | - | 1) | |
116 | - | ;; archaic term | |
117 | - | ((eq? tag (get-entity "arch")) | |
118 | - | -3) | |
119 | - | ;; children language | |
120 | - | ((eq? tag (get-entity "chn")) | |
121 | - | -1) | |
122 | - | ;; colloquialism | |
123 | - | ((eq? tag (get-entity "col")) | |
124 | - | 5) | |
125 | - | ;; expression | |
126 | - | ((eq? tag (get-entity "exp")) | |
127 | - | 1) | |
128 | - | ;; sonkeigo (honorific or respectful) | |
129 | - | ((eq? tag (get-entity "hon")) | |
130 | - | -1) | |
131 | - | ;; kenjougo (humble) | |
132 | - | ((eq? tag (get-entity "hum")) | |
133 | - | -1) | |
134 | - | ;; teineigo (polite) | |
135 | - | ((eq? tag (get-entity "pol")) | |
136 | - | -1) | |
137 | - | ;; irregular kanji | |
138 | - | ((eq? tag (get-entity "iK")) | |
139 | - | -2) | |
140 | - | ;; idiomatic expression | |
141 | - | ((eq? tag (get-entity "id")) | |
142 | - | 3) | |
143 | - | ;; slang | |
144 | - | ((member tag (list | |
145 | - | (get-entity "sl") | |
146 | - | (get-entity "m-sl") | |
147 | - | (get-entity "male-sl"))) | |
148 | - | -4) | |
149 | - | ;; outdated kanji | |
150 | - | ((eq? tag (get-entity "oK")) | |
151 | - | -2) | |
152 | - | ;; obsolete term | |
153 | - | ((eq? tag (get-entity "obs")) | |
154 | - | -5) | |
155 | - | ;; obscure term | |
156 | - | ((eq? tag (get-entity "obsc")) | |
157 | - | -10) | |
158 | - | ;; onomatopea | |
159 | - | ((eq? tag (get-entity "on-mim")) | |
160 | - | 1) | |
161 | - | ;; poetical term | |
162 | - | ((eq? tag (get-entity "poet")) | |
163 | - | -1) | |
164 | - | ((eq? tag (get-entity "proverb")) | |
165 | - | 1) | |
166 | - | ((eq? tag (get-entity "rare")) | |
167 | - | -3) | |
168 | - | ;; sensitive | |
169 | - | ((eq? tag (get-entity "sens")) | |
170 | - | -2) | |
171 | - | ;; dialect | |
172 | - | ((member tag (list | |
173 | - | (get-entity "kyb") | |
174 | - | (get-entity "osb") | |
175 | - | (get-entity "ksb") | |
176 | - | (get-entity "ktb") | |
177 | - | (get-entity "tsb") | |
178 | - | (get-entity "thb") | |
179 | - | (get-entity "tsug") | |
180 | - | (get-entity "kyu") | |
181 | - | (get-entity "rkb") | |
182 | - | (get-entity "nab") | |
183 | - | (get-entity "hob"))) | |
184 | - | -1) | |
185 | - | (else 0))) | |
186 | - | tags)))) | |
187 | - | ||
188 | - | (define (sxml->results sxml frq) | |
189 | - | (sort | |
190 | - | (foldt | |
191 | - | (lambda (xml) | |
192 | - | (match xml | |
193 | - | (('ent_seq _) "") | |
194 | - | (('ke_pri _) "") | |
195 | - | (('re_pri _) "") | |
196 | - | (('pri _) "") | |
197 | - | (('keb kanji) `(kanji ,kanji)) | |
198 | - | (('ke_inf _) "") | |
199 | - | (('k_ele lst ...) (car (filter list? lst))) | |
200 | - | (('reb reading) `(reading ,reading)) | |
201 | - | (('re_nokanji _ ...) "") | |
202 | - | (('re_restr r) `(limit ,r)) | |
203 | - | (('re_inf r) `(info ,r)) | |
204 | - | (('r_ele lst ...) (sxml->reading lst)) | |
205 | - | (('ant pos) `(ref ,pos)) | |
206 | - | (('dial pos) `(tag ,(get-entity pos))) | |
207 | - | (('field pos) `(tag ,(get-entity pos))) | |
208 | - | (('misc pos) `(tag ,(get-entity pos))) | |
209 | - | (('pos pos) `(tag ,(get-entity pos))) | |
210 | - | (('xref pos) `(ref ,pos)) | |
211 | - | (('g_type _ ...) "") | |
212 | - | (('gloss (? string? g)) xml) | |
213 | - | (('gloss (? list? g)) g) | |
214 | - | (('gloss attr g) (cons `(gloss ,g) attr)) | |
215 | - | (('stagk k) `(limit ,k)) | |
216 | - | (('stagr r) `(limit ,r)) | |
217 | - | (('s_inf r) `(info ,r)) | |
218 | - | (('lsource lst ...) (sxml->source lst)) | |
219 | - | (('sense lst ...) (sxml->sense lst)) | |
220 | - | (('entry lst ...) (sxml->result lst frq)) | |
221 | - | (('JMdict lst ...) lst) | |
222 | - | (('xml:lang l) `(lang ,l)) | |
223 | - | (('ls_wasei _) '(wasei #t)) | |
224 | - | (('ls_type t) `(type ,t)) | |
225 | - | (('@ lst ...) lst) | |
226 | - | (('*TOP* _ l) (filter result? l)) | |
227 | - | (('*PI* _ ...) #f))) | |
228 | - | (lambda (a) a) | |
229 | - | sxml) | |
230 | - | ;; reverse order: bigger score first | |
231 | - | (lambda (a b) (> (result-points a) (result-points b))))) | |
104 | + | (define (sxml->element lst elem frq) | |
105 | + | (match elem | |
106 | + | ('ent_seq "") | |
107 | + | ('ke_pri "") | |
108 | + | ('re_nokanji "") | |
109 | + | ('re_pri "") | |
110 | + | ('ke_inf "") | |
111 | + | ('misc (if (and (= (length lst) 1) (string? (car lst))) | |
112 | + | `(tag ,(get-tag (car lst))) | |
113 | + | (throw 'invalid-misc lst))) | |
114 | + | ('re_restr (if (and (= (length lst) 1) (string? (car lst))) | |
115 | + | `(limit ,(car lst)) | |
116 | + | (throw 'invalid-re_restr lst))) | |
117 | + | ('keb (if (and (= (length lst) 1) (string? (car lst))) | |
118 | + | `(kanji ,(car lst)) | |
119 | + | (throw 'invalid-keb lst))) | |
120 | + | ('s_inf (if (and (= (length lst) 1) (string? (car lst))) | |
121 | + | `(info ,(car lst)) | |
122 | + | (throw 'invalid-s_inf lst))) | |
123 | + | ('dial (if (and (= (length lst) 1) (string? (car lst))) | |
124 | + | `(tag ,(get-tag (car lst))) | |
125 | + | (throw 'invalid-dial lst))) | |
126 | + | ('re_inf (if (and (= (length lst) 1) (string? (car lst))) | |
127 | + | `(info ,(car lst)) | |
128 | + | (throw 'invalid-re_inf lst))) | |
129 | + | ('stagk (if (and (= (length lst) 1) (string? (car lst))) | |
130 | + | `(limit ,(car lst)) | |
131 | + | (throw 'invalid-stagk lst))) | |
132 | + | ('stagr (if (and (= (length lst) 1) (string? (car lst))) | |
133 | + | `(limit ,(car lst)) | |
134 | + | (throw 'invalid-stagr lst))) | |
135 | + | ('field (if (and (= (length lst) 1) (string? (car lst))) | |
136 | + | `(tag ,(get-tag (car lst))) | |
137 | + | (throw 'invalid-field lst))) | |
138 | + | ('ant (if (and (= (length lst) 1) (string? (car lst))) | |
139 | + | `(ref ,(car lst)) | |
140 | + | (throw 'invalid-ant lst))) | |
141 | + | ('reb `(reading ,lst)) | |
142 | + | ('r_ele (sxml->reading lst)) | |
143 | + | ('k_ele (car (filter list? lst))) | |
144 | + | ('pos (if (and (= (length lst) 1) (string? (car lst))) | |
145 | + | `(tag ,(get-tag (car lst))) | |
146 | + | (throw 'invalid-pos lst))) | |
147 | + | ('xref (if (and (= (length lst) 1) (string? (car lst))) | |
148 | + | `(ref ,(car lst)) | |
149 | + | (throw 'invalid-xref lst))) | |
150 | + | ('gloss (append | |
151 | + | (filter list? lst) | |
152 | + | (map (lambda (v) `(gloss ,v)) (filter string? lst)))) | |
153 | + | ('lsource (sxml->source lst)) | |
154 | + | ('sense (sxml->sense lst)) | |
155 | + | ('entry (sxml->result lst frq)))) | |
156 | + | ||
157 | + | (define (create-parser frq) | |
158 | + | (ssax:make-parser | |
159 | + | NEW-LEVEL-SEED | |
160 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
161 | + | (map | |
162 | + | (match-lambda | |
163 | + | ((k . v) (list k v))) | |
164 | + | (filter | |
165 | + | (match-lambda | |
166 | + | ((k . v) (not (member k '(g_type))))) | |
167 | + | attributes))) | |
168 | + | ||
169 | + | FINISH-ELEMENT | |
170 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
171 | + | (if (equal? elem-gi 'JMdict) | |
172 | + | seed | |
173 | + | (let* ((seed (reverse seed)) | |
174 | + | (element (sxml->element seed elem-gi frq))) | |
175 | + | (cons element parent-seed)))) | |
176 | + | ||
177 | + | CHAR-DATA-HANDLER | |
178 | + | (lambda (string1 string2 seed) | |
179 | + | (cons (string-append string1 string2) seed)))) | |
180 | + | ||
181 | + | (define (xml->results port frq) | |
182 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
183 | + | (sort results (lambda (a b) (> (result-points a) (result-points b)))))) |
modules/nani/tags.scm unknown status 1
1 | + | (define-module (nani tags) | |
2 | + | #:use-module (nani result) | |
3 | + | #:export (get-tag get-points)) | |
4 | + | ||
5 | + | (define tags | |
6 | + | '("MA" "X" "abbr" "adj-i" "adj-ix" "adj-na" "adj-no" "adj-pn" "adj-t" "adj-f" | |
7 | + | "adv" "adv-to" "arch" "ateji" "aux" "aux-v" "aux-adj" "Buddh" "chem" "chn" | |
8 | + | "col" "comp" "conj" "cop" "ctr" "derog" "eK" "ek" "exp" "fam" "fem" "food" | |
9 | + | "geom" "gikun" "hon" "hum" "iK" "id" "ik" "int" "io" "iv" "ling" "m-sl" | |
10 | + | "male" "male-sl" "math" "meteor" "mil" "n" "n-adv" "n-suf" "n-pref" "n-t" | |
11 | + | "num" "oK" "obs" "obsc" "ok" "oik" "on-mim" "pn" "poet" "pol" "pref" "proverb" | |
12 | + | "prt" "physics" "quote" "rare" "sens" "sl" "suf" "uK" "uk" "unc" "yoji" | |
13 | + | "v1" "v1-s" "v2a-s" "v4h" "v4r" "v5aru" "v5b" "v5g" "v5k" "v5k-s" "v5m" | |
14 | + | "v5n" "v5r" "v5r-i" "v5s" "v5t" "v5u" "v5u-s" "v5uru" "vz" "vi" "vk" "vn" | |
15 | + | "vr" "vs" "vs-c" "vs-s" "vs-i" "kyb" "osb" "ksb" "ktb" "tsb" "thb" "tsug" | |
16 | + | "kyu" "rkb" "nab" "hob" "vt" "vulg" "adj-kari" "adj-ku" "adj-shiku" | |
17 | + | "adj-nari" "n-pr" "v-unspec" "v4k" "v4g" "v4s" "v4t" "v4n" "v4b" "v4m" | |
18 | + | "v2k-k" "v2g-k" "v2t-k" "v2d-k" "v2h-k" "v2b-k" "v2m-k" "v2y-k" "v2r-k" | |
19 | + | "v2k-s" "v2g-s" "v2s-s" "v2z-s" "v2t-s" "v2d-s" "v2n-s" "v2h-s" "v2b-s" | |
20 | + | "v2m-s" "v2y-s" "v2r-s" "v2w-s" "archit" "astron" "baseb" "biol" "bot" | |
21 | + | "bus" "econ" "engr" "finc" "geol" "law" "mahj" "med" "music" "Shinto" | |
22 | + | "shogi" "sports" "sumo" "zool" "joc" "anat" "Christn" "net-sl" "dated" | |
23 | + | "hist" "litf" "surname" "place" "unclass" "company" "product" "work" | |
24 | + | "person" "given" "station" "organization" | |
25 | + | ||
26 | + | ;; wadoku special | |
27 | + | "young" "thief" "secret" "baby" "stud" "sail" "fashion" "archeo" | |
28 | + | "rel" "color" "psy" "print" "aero" "politics" "anth" "biblio" | |
29 | + | "game" "agri" "alco" "demo" "train" "philo" "ethno" "photo" "trans" | |
30 | + | "telecom" "build" "sci" "school" "art" "hobby" | |
31 | + | ||
32 | + | "season-spring" "season-summer" "season-winter" "season-autumn" | |
33 | + | ||
34 | + | "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000")) | |
35 | + | ||
36 | + | (define (get-tag tag) | |
37 | + | (let ((val (member tag tags))) | |
38 | + | (if val (- (length val) 1) (throw 'unknown-tag tag)))) | |
39 | + | ||
40 | + | (define (get-points result) | |
41 | + | (define (tag-point tag) | |
42 | + | (cond | |
43 | + | ;; frequency | |
44 | + | ((eq? tag (get-tag "frq500")) | |
45 | + | 32) | |
46 | + | ((eq? tag (get-tag "frq1000")) | |
47 | + | 16) | |
48 | + | ((eq? tag (get-tag "frq2000")) | |
49 | + | 8) | |
50 | + | ((eq? tag (get-tag "frq5000")) | |
51 | + | 4) | |
52 | + | ((eq? tag (get-tag "frq10000")) | |
53 | + | 2) | |
54 | + | ((eq? tag (get-tag "frq20000")) | |
55 | + | 1) | |
56 | + | ;; archaic term | |
57 | + | ((eq? tag (get-tag "arch")) | |
58 | + | -3) | |
59 | + | ;; children language | |
60 | + | ((eq? tag (get-tag "chn")) | |
61 | + | -1) | |
62 | + | ;; colloquialism | |
63 | + | ((eq? tag (get-tag "col")) | |
64 | + | 5) | |
65 | + | ;; expression | |
66 | + | ((eq? tag (get-tag "exp")) | |
67 | + | 1) | |
68 | + | ;; sonkeigo (honorific or respectful) | |
69 | + | ((eq? tag (get-tag "hon")) | |
70 | + | -1) | |
71 | + | ;; kenjougo (humble) | |
72 | + | ((eq? tag (get-tag "hum")) | |
73 | + | -1) | |
74 | + | ;; teineigo (polite) | |
75 | + | ((eq? tag (get-tag "pol")) | |
76 | + | -1) | |
77 | + | ;; irregular kanji | |
78 | + | ((eq? tag (get-tag "iK")) | |
79 | + | -2) | |
80 | + | ;; idiomatic expression | |
81 | + | ((eq? tag (get-tag "id")) | |
82 | + | 3) | |
83 | + | ;; slang | |
84 | + | ((member tag (list | |
85 | + | (get-tag "sl") | |
86 | + | (get-tag "m-sl") | |
87 | + | (get-tag "male-sl"))) | |
88 | + | -4) | |
89 | + | ;; outdated kanji | |
90 | + | ((eq? tag (get-tag "oK")) | |
91 | + | -2) | |
92 | + | ;; obsolete term | |
93 | + | ((eq? tag (get-tag "obs")) | |
94 | + | -5) | |
95 | + | ;; obscure term | |
96 | + | ((eq? tag (get-tag "obsc")) | |
97 | + | -10) | |
98 | + | ;; onomatopea | |
99 | + | ((eq? tag (get-tag "on-mim")) | |
100 | + | 1) | |
101 | + | ;; poetical term | |
102 | + | ((eq? tag (get-tag "poet")) | |
103 | + | -1) | |
104 | + | ((eq? tag (get-tag "proverb")) | |
105 | + | 1) | |
106 | + | ((eq? tag (get-tag "rare")) | |
107 | + | -3) | |
108 | + | ;; sensitive | |
109 | + | ((eq? tag (get-tag "sens")) | |
110 | + | -2) | |
111 | + | ;; dialect | |
112 | + | ((member tag (list | |
113 | + | (get-tag "kyb") | |
114 | + | (get-tag "osb") | |
115 | + | (get-tag "ksb") | |
116 | + | (get-tag "ktb") | |
117 | + | (get-tag "tsb") | |
118 | + | (get-tag "thb") | |
119 | + | (get-tag "tsug") | |
120 | + | (get-tag "kyu") | |
121 | + | (get-tag "rkb") | |
122 | + | (get-tag "nab") | |
123 | + | (get-tag "hob"))) | |
124 | + | -1) | |
125 | + | (else 0))) | |
126 | + | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
127 | + | (apply + (map tag-point tags)))) |