Build full wadoku dictionary
Makefile
| 1 | 1 | .PRECIOUS: po/%.po dictionaries/% | |
| 2 | 2 | ||
| 3 | + | all: site | |
| 4 | + | ||
| 3 | 5 | JMDICT_LANGS=dut fre ger hun rus slv spa swe | |
| 4 | 6 | ||
| 5 | 7 | DICOS=dicos/JMdict_e.nani $(addprefix dicos/JMdict_, $(addsuffix .nani, $(JMDICT_LANGS))) | |
| 6 | 8 | ||
| 9 | + | DOWNLOADS=dictionaries/JMdict_e.xml dictionaries/JMdict.xml dictionaries/frequency.tsv | |
| 10 | + | ||
| 11 | + | include radicals.mk | |
| 12 | + | include wadoku.mk | |
| 13 | + | ||
| 7 | 14 | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
| 8 | 15 | ||
| 9 | 16 | HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \ | |
… | |||
| 21 | 28 | LANGS=fr | |
| 22 | 29 | MOFILES=$(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) | |
| 23 | 30 | ||
| 24 | - | all: $(MOFILES) $(WEB_FILES) | |
| 31 | + | site: $(MOFILES) $(WEB_FILES) | |
| 25 | 32 | haunt build | |
| 26 | 33 | rm -rf public.bak | |
| 27 | 34 | mv public public.bak | |
| 28 | 35 | mv site public | |
| 29 | - | touch all | |
| 36 | + | touch site | |
| 30 | 37 | ||
| 31 | 38 | download: | |
| 32 | 39 | @rm -f dictionaries/* | |
| 33 | - | @$(MAKE) dictionaries/JMdict_e.xml dictionaries/JMdict.xml dictionaries/frequency.tsv dictionaries/wadoku.xml dictionaries/radkfilex.utf8 | |
| 40 | + | @$(MAKE) $(DOWNLOADS) | |
| 34 | 41 | ||
| 35 | 42 | # Download JMdict dictionaries from ERDRG | |
| 36 | 43 | dictionaries/%.xml: | |
… | |||
| 76 | 83 | ||
| 77 | 84 | dicos/list: $(DICOS) tools/list.scm $(MOFILES) | |
| 78 | 85 | guile -L modules -L . tools/list.scm $@ $(DICOS) | |
| 79 | - | ||
| 80 | - | include radicals.mk | |
| 81 | - | include wadoku.mk | |
modules/nani/tags.scm
| 30 | 30 | "telecom" "build" "sci" "school" "art" "hobby" | |
| 31 | 31 | ||
| 32 | 32 | "season-spring" "season-summer" "season-winter" "season-autumn" | |
| 33 | - | ||
| 33 | + | "season-newyear" | |
| 34 | + | ||
| 35 | + | "jlpt-1" "jlpt-2" "jlpt-3" "jlpt-4" "jlpt-5" | |
| 36 | + | ||
| 34 | 37 | "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000")) | |
| 35 | 38 | ||
| 36 | 39 | (define (get-tag tag) |
modules/nani/wadoku/entities.scm
| 20 | 20 | ("Anat." . "anat") | |
| 21 | 21 | ("Angeln" . "sports") | |
| 22 | 22 | ("Anime" . "work") | |
| 23 | - | ("Anthropol." . "antho") | |
| 23 | + | ("Anthropol." . "anth") | |
| 24 | 24 | ("arab. Gesch." . "hist") | |
| 25 | 25 | ("Archäol." . "archeo") | |
| 26 | 26 | ("Archit." . "archit") | |
… | |||
| 104 | 104 | ("dtsch. Gramm." . "ling") | |
| 105 | 105 | ("EDV" . "comp") | |
| 106 | 106 | ("Einzel-Kanji" . "ling") | |
| 107 | - | ("Streckenkilometertafel" . "train") | |
| 107 | + | ("Eisenb." . "engr") | |
| 108 | 108 | ("Eishockey" . "sports") | |
| 109 | 109 | ("Eiskunstl." . "sports") | |
| 110 | 110 | ("Eiskunstlauf" . "sports") | |
… | |||
| 141 | 141 | ("Frisur" . "fashion") | |
| 142 | 142 | ("Funkt." . "physics") | |
| 143 | 143 | ("Fußball" . "sports") | |
| 144 | - | ||
| 145 | 144 | ("Gagaku" . "music") | |
| 146 | - | ("Garten" . "bota") | |
| 147 | - | ("Gartenk." . "bota") | |
| 145 | + | ("Garten" . "bot") | |
| 146 | + | ("Gartenk." . "bot") | |
| 148 | 147 | ("Gateball" . "sports") | |
| 149 | 148 | ("Gebietsn." . "place") | |
| 150 | 149 | ("Geldw." . "finc") | |
… | |||
| 220 | 219 | ("Kanbun" . "ling") | |
| 221 | 220 | ("Kanji" . "ling") | |
| 222 | 221 | ("Kanpō" . "med") | |
| 223 | - | ("Karten" . "games") | |
| 222 | + | ("Karten" . "game") | |
| 224 | 223 | ("Kartenspiel" . "game") | |
| 225 | 224 | ("Kartogr." . "place") | |
| 226 | 225 | ("kath. Christent." . "Christn") | |
… | |||
| 285 | 284 | ("Mus." . "music") | |
| 286 | 285 | ("Muschelk." . "zool") | |
| 287 | 286 | ("Musikinstr." . "music") | |
| 288 | - | ("Mykol." . "bota") | |
| 287 | + | ("Mykol." . "bot") | |
| 289 | 288 | ("Mythol." . "rel") | |
| 290 | 289 | ("Nachrichtent." . "politics") | |
| 291 | 290 | ("Nähen" . "fashion") | |
… | |||
| 394 | 393 | ("Steuerw." . "econ") | |
| 395 | 394 | ("Sticken" . "build") | |
| 396 | 395 | ("Straßenbau" . "engr") | |
| 396 | + | ("Streckenkilometertafel" . "train") | |
| 397 | 397 | ("Stricken" . "build") | |
| 398 | 398 | ("Strukturalismus" . "philo") | |
| 399 | 399 | ("Sumō" . "sumo") | |
modules/nani/wadoku/xml.scm
| 54 | 54 | (reg (usage-reg usage))) | |
| 55 | 55 | `(,@(if (string-null? content) | |
| 56 | 56 | (match type | |
| 57 | - | ("" '())) | |
| 57 | + | ("" '()) | |
| 58 | + | ("time" '()) | |
| 59 | + | ("hint" '()) | |
| 60 | + | ("abrev" '(tag "abbr"))) | |
| 58 | 61 | (match type | |
| 59 | 62 | ("dom" `((tag ,content))) | |
| 63 | + | ("time" `((tag "obs"))); always obsol. | |
| 60 | 64 | ("hint" (let ((tag (hint->tag content))) | |
| 61 | 65 | (if tag `(tag ,tag) '()))))) | |
| 62 | 66 | ,@(match reg | |
| 63 | 67 | ("" '()) | |
| 68 | + | ("dial." '()) | |
| 64 | 69 | ("lit" `((tag "litf"))) | |
| 65 | 70 | ("kinderspr." `((tag "chn"))) | |
| 66 | 71 | ("kleinkindspr." `((tag "baby"))) | |
| 67 | 72 | ("jugendspr." `((tag "young"))) | |
| 73 | + | ("besch.-höfl." `((tag "hum"))) | |
| 68 | 74 | ("coll" `((tag "col"))) | |
| 69 | 75 | ("vulg." `((tag "vulg"))) | |
| 70 | 76 | ("Slang" `((tag "sl"))) | |
… | |||
| 167 | 173 | (loop | |
| 168 | 174 | (match (car lst) | |
| 169 | 175 | (('impli impli) (update-source source #:lang impli)) | |
| 170 | - | (('foreign foreign) (update-source source #:content foreign)) | |
| 176 | + | (('foreign foreign) | |
| 177 | + | (update-source source | |
| 178 | + | #:content (if (list? foreign) | |
| 179 | + | (append foreign (source-content source)) | |
| 180 | + | (cons foreign (source-content source))))) | |
| 171 | 181 | ((? list? l) (loop source l)) | |
| 172 | 182 | (_ source)) | |
| 173 | 183 | (cdr lst))))) | |
… | |||
| 287 | 297 | ((('meta . _) lst ...) | |
| 288 | 298 | (loop tags result lst)) | |
| 289 | 299 | ((('genki . genki) lst ...) | |
| 290 | - | (loop (cons `(tag ,genki) tags) result lst)) | |
| 300 | + | ;(loop (cons `(tag ,genki) tags) result lst)) | |
| 301 | + | (loop tags result lst)) | |
| 291 | 302 | ((('jlpt . jlpt) lst ...) | |
| 292 | 303 | (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst)) | |
| 293 | 304 | ((('ref . (? string? ref)) lst ...) | |
| 294 | 305 | (loop (cons `(ref ,ref) tags) result lst)) | |
| 306 | + | ((('ref (? string? ref)) lst ...) | |
| 307 | + | (loop (cons `(ref ,ref) tags) result lst)) | |
| 295 | 308 | ((('jap l) lst ...) | |
| 296 | 309 | (sub-loop loop tags result lst l)) | |
| 297 | 310 | ((('foreign l) lst ...) | |
… | |||
| 301 | 314 | ((((? symbol? s) v) lst ...) | |
| 302 | 315 | (throw 'unsupported-symbol s v)) | |
| 303 | 316 | ((((? symbol? s) . v) lst ...) | |
| 304 | - | (throw 'unsupported-symbol s v)) | |
| 317 | + | (throw 'unsupported-symbol-pair s v)) | |
| 305 | 318 | (((? list? l) lst ...) | |
| 306 | 319 | (sub-loop loop tags result lst l))))) | |
| 307 | 320 | ||
radicals.mk
| 1 | 1 | RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm | |
| 2 | 2 | DICOS+=dicos/radicals.nani | |
| 3 | + | DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml | |
| 3 | 4 | ||
| 4 | - | dictionaries/kradzip.zip: | |
| 5 | - | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O $@ | |
| 5 | + | dictionaries/radkfilex.utf8: | |
| 6 | + | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
| 7 | + | unzip dictionaries/kradzip.zip radkfilex -d dictionaries | |
| 8 | + | iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@ | |
| 9 | + | rm radkfilex | |
| 6 | 10 | ||
| 7 | - | dictionaries/radkfilex: dictionaries/kradzip.zip | |
| 8 | - | unzip $^ $$(basename $@) -d $$(dirname $@) | |
| 9 | - | touch $@ | |
| 10 | - | ||
| 11 | - | dictionaries/radkfilex.utf8: dictionaries/radkfilex | |
| 12 | - | iconv -f euc-jp -t utf-8 $^ > $@ | |
| 13 | - | ||
| 14 | - | dictionaries/kanjidic2.xml.gz: | |
| 15 | - | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@ | |
| 16 | - | ||
| 17 | - | dictionaries/kanjidic2.xml: dictionaries/kanjidic2.xml.gz | |
| 18 | - | gunzip $^ | |
| 11 | + | dictionaries/kanjidic2.xml: | |
| 12 | + | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz | |
| 13 | + | gunzip $@.gz | |
| 19 | 14 | ||
| 20 | 15 | dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES) | |
| 21 | 16 | guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@ |
tools/wadoku.scm
| 41 | 41 | results)) | |
| 42 | 42 | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
| 43 | 43 | results))) | |
| 44 | - | (pk (car results)) | |
| 45 | - | (let ((readings (apply append (map result-readings results)))) | |
| 46 | - | (pk (apply append (map reading-readings readings)))) | |
| 47 | 44 | results)) | |
| 48 | 45 | ||
| 49 | 46 | (define (compile input sense-filter output) |
wadoku.mk
| 1 | 1 | WADOKU_TMP_DIR=dictionaries/wadoku-tmp | |
| 2 | 2 | DICOS+=dicos/wadoku_ger.nani | |
| 3 | + | DOWNLOADS+=dictionaries/wadoku.xml | |
| 3 | 4 | ||
| 4 | 5 | dictionaries/wadoku.xml: | |
| 5 | 6 | mkdir $(WADOKU_TMP_DIR) |