guile-jsonld/jsonld/download.scm

download.scm

1
;;;; Copyright (C) 2019, 2020 Julien Lepiller <julien@lepiller.eu>
2
;;;; 
3
;;;; This library is free software; you can redistribute it and/or
4
;;;; modify it under the terms of the GNU Lesser General Public
5
;;;; License as published by the Free Software Foundation; either
6
;;;; version 3 of the License, or (at your option) any later version.
7
;;;; 
8
;;;; This library is distributed in the hope that it will be useful,
9
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
10
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
;;;; Lesser General Public License for more details.
12
;;;; 
13
;;;; You should have received a copy of the GNU Lesser General Public
14
;;;; License along with this library; if not, write to the Free Software
15
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
;;;; 
17
18
(define-module (jsonld download)
19
  #:use-module (ice-9 match)
20
  #:use-module (iri iri)
21
  #:use-module (json)
22
  #:use-module (jsonld json)
23
  #:use-module (web client)
24
  #:use-module (web response)
25
  #:use-module (web uri)
26
  #:use-module (rnrs bytevectors)
27
  #:use-module (srfi srfi-1)
28
  #:use-module (srfi srfi-9)
29
  #:use-module (sxml simple)
30
  #:export (json-document
31
            make-json-document
32
            new-json-document
33
            json-document?
34
            json-document-context-url
35
            json-document-document-url
36
            json-document-document
37
            json-document-content-type
38
            json-document-profile
39
40
            download-json))
41
42
(define-record-type json-document
43
  (make-json-document context-url document-url document content-type profile)
44
  json-document?
45
  (context-url  json-document-context-url)
46
  (document-url json-document-document-url)
47
  (document     json-document-document)
48
  (content-type json-document-content-type)
49
  (profile      json-document-profile))
50
51
(define* (new-json-document #:key context-url document-url document content-type
52
                            profile)
53
  (make-json-document context-url document-url document content-type profile))
54
55
(define-record-type link-header
56
  (make-link-header uri params)
57
  link-header?
58
  (uri    link-header-uri)
59
  (params link-header-params))
60
61
(define-record-type content-type-header
62
  (make-content-type-header type params)
63
  content-type-header?
64
  (type   content-type-header-type)
65
  (params content-type-header-params))
66
67
(define (parse-link-header link)
68
  (let* ((header (string-split link #\;))
69
         (uri (car header))
70
         (uri (substring uri 1 (- (string-length uri) 1)))
71
         (params (cdr header))
72
         (params (map
73
                   (lambda (param)
74
                     (match (map
75
                              (lambda (l) (string-trim l #\space))
76
                              (string-split param #\=))
77
                       ((type val)
78
                        (cons type (substring val 1 (- (string-length val) 1))))))
79
                   params)))
80
    (make-link-header uri params)))
81
82
(define (parse-type type)
83
  (let* ((mime-type (string-split type #\;))
84
         (type (car mime-type))
85
         (params (map
86
                   (lambda (param)
87
                     (match (string-split param #\=)
88
                       ((type val ...)
89
                        (cons (string-trim type #\space)
90
                              (string-trim (string-join val "=") #\space)))))
91
                   (cdr mime-type))))
92
    (make-content-type-header type params)))
93
94
(define (html-document-base base document)
95
  (define (find-base document)
96
    (match document
97
      (('base ('@ attr ...))
98
       (let loop ((attr attr))
99
         (if (null? attr)
100
           #f
101
           (match (car attr)
102
             (('href base) base)
103
             (_ (loop (cdr attr)))))))
104
      ('() #f)
105
      ((tag ('@ opt ...) rest ...)
106
       (fold (lambda (element result)
107
               (or result (find-base element)))
108
             #f rest))
109
      ((tag rest ...)
110
       (fold (lambda (element result)
111
               (or result (find-base element)))
112
             #f rest))
113
      (_ #f)))
114
115
  (let ((new-base (find-base document)))
116
    (if new-base
117
        (resolve-iri base new-base)
118
        base)))
119
120
(define (get-script-elements document)
121
  (define (find-scripts document)
122
    (match document
123
      (('script t ...)
124
       (list document))
125
      ('() '())
126
      ((tag ('@ opt ...) rest ...)
127
       (append-map get-script-elements rest))
128
      ((tag rest ...)
129
       (append-map get-script-elements rest))
130
      (_ '())))
131
132
  (define (jsonld-script? element)
133
    (match element
134
      (('script ('@ opt ...) content)
135
       (let* ((types (filter (lambda (opt) (equal? (car opt) 'type)) opt))
136
              (type (if (null? types) #f (cadr (car types)))))
137
         (and type
138
              (equal? (content-type-header-type (parse-type type))
139
                      "application/ld+json"))))
140
      (_ #f)))
141
142
  (filter jsonld-script? (find-scripts document)))
143
144
(define (get-script-content-by-id document id)
145
  (define (has-correct-id? element)
146
    (match element
147
      ((tag ('@ opts ...) content)
148
       (let* ((ids (filter (lambda (o) (equal? (car o) 'id)) opts))
149
              (element-id (if (null? ids) #f (cadr (car ids)))))
150
         (equal? element-id id)))))
151
152
  (let ((id-scripts (filter has-correct-id? (get-script-elements document))))
153
    (when (null? id-scripts)
154
      (throw 'loading-document-failed))
155
    (get-script-content (car id-scripts))))
156
157
(define (get-script-content-by-profile document profile)
158
  (define (has-correct-profile? element)
159
    (match element
160
      ((tag ('@ opts ...) content)
161
       (let* ((types (filter (lambda (o) (equal? (car o) 'type)) opts))
162
              (type (if (null? types) #f (cadr (car types)))))
163
         (and type
164
              (equal?
165
                (assoc-ref
166
                  (content-type-header-params (parse-type type))
167
                  "profile")
168
                profile))))))
169
170
  (let ((profile-scripts (filter has-correct-profile?
171
                                 (get-script-elements document))))
172
    (when (null? profile-scripts)
173
      (throw 'loading-document-failed))
174
    (get-script-content (car profile-scripts))))
175
176
(define (get-first-script-content document)
177
  (let ((scripts (get-script-elements document)))
178
    (when (null? scripts)
179
      (throw 'loading-document-failed))
180
    (get-script-content (car scripts))))
181
182
(define (get-script-content element)
183
  (match element
184
     (('script ('@ opts ...) content)
185
      content)))
186
187
(define (reverse-document document)
188
  "Reverse the order of elements in @var{document}, a Json object, array or
189
scalar.  This recursively reverses the order of any array or key-pair
190
association."
191
  (match document
192
    ((? json-array? document)
193
     (list->array 1 (map reverse-document (array->list document))))
194
    ((? json-object? document)
195
     (fold (lambda (pair result) (cons (cons (car pair) (reverse-document (cdr pair))) result)) '() document))
196
    (_ document)))
197
198
(define (links-with-rel links rel)
199
  (filter
200
    (lambda (link)
201
      (equal?
202
        (assoc-ref (link-header-params link) "rel")
203
        rel))
204
    links))
205
206
(define (html->sxml document)
207
  "Convert document (a string) into an sxml representation that preserves entities"
208
  (xml->sxml document
209
             #:entities
210
             '((lt . "&amp;lt;")
211
               (gt . "&amp;gt;")
212
               ;(amp . "&amp;amp;")
213
               (apos . "&amp;apos;"))
214
             #:default-entity-handler
215
             (lambda (port name)
216
               (string-append "&amp;" (symbol->string name) ";"))))
217
218
(define* (download-json uri #:key (mime-type "application/ld+json")
219
                        (extract-all-scripts? #f)
220
                        (profile #f) (request-profile #f)
221
                        ;; Used for tests
222
                        (http-get http-get))
223
  "Download a JsonLD document from @var{uri}, using the HTTP protocol.  The
224
Accept HTTP header can be modified with @var{mime-type}."
225
  (define complete-mime-type
226
    (string-append
227
      (if request-profile
228
          (string-append mime-type ";profile=" request-profile)
229
          mime-type)
230
      (if (equal? request-profile "http://www.w3.org/ns/json-ld#context")
231
          ""
232
          ", text/html;q=0.8; application/xhtml+xml;q=0.5")))
233
234
  (catch #t
235
    (lambda ()
236
      (call-with-values
237
        (lambda ()
238
          (http-get uri #:headers `((Accept . ,complete-mime-type))))
239
        (lambda (hdr body)
240
          (cond
241
            ((member (response-code hdr) '(301 302 303 307))
242
             (download-json (uri->string (response-location hdr))
243
                            #:mime-type mime-type
244
                            #:extract-all-scripts? extract-all-scripts?
245
                            #:profile profile
246
                            #:request-profile request-profile))
247
            ((equal? (response-code hdr) 200)
248
             (let* ((content-type (response-content-type hdr))
249
                    (type (symbol->string (car content-type)))
250
                    (link-headers
251
                      (map cdr (filter
252
                                 (lambda (p) (equal? (car p) 'link))
253
                                 (response-headers hdr))))
254
                    (links (map parse-link-header link-headers))
255
                    (alternates (links-with-rel links "alternate"))
256
                    (alternates (filter
257
                                  (lambda (alternate)
258
                                    (equal?
259
                                      (assoc-ref (link-header-params alternate) "type")
260
                                      "application/ld+json"))
261
                                  alternates))
262
                    (contexts (links-with-rel links "http://www.w3.org/ns/json-ld#context"))
263
                    (context-url #f)
264
                    (document #f)
265
                    (base (if (string? uri) uri (uri->string uri))))
266
               (if (and (not (equal? type "application/json"))
267
                        (or
268
                          (< (string-length type) 5)
269
                          (not (equal?
270
                               (substring type
271
                                          (- (string-length type) 5))
272
                               "+json")))
273
                        (not (null? alternates)))
274
                   (download-json (resolve-iri base (link-header-uri (car alternates)))
275
                                  #:mime-type mime-type
276
                                  #:extract-all-scripts? extract-all-scripts?
277
                                  #:profile profile
278
                                  #:request-profile request-profile)
279
                   (begin
280
                     (when (and (or (equal? type "application/json")
281
                                    (and (> (string-length type) 5)
282
                                         (equal? (substring
283
                                                   type (- (string-length type) 5))
284
                                                 "+json")))
285
                                (not (equal? type "application/ld+json"))
286
                                (not (null? contexts)))
287
                       (set! context-url
288
                         (resolve-iri base (link-header-uri (car contexts))))
289
                       (when (> (length contexts) 1)
290
                         (throw 'multiple-context-link-headers)))
291
                     (cond
292
                       ((or (equal? type "application/json")
293
                            (and (> (string-length type) 5)
294
                                 (equal? (substring type
295
                                                    (- (string-length type) 5))
296
                                         "+json")))
297
                        (set! document (reverse-document
298
                                         (json-string->scm
299
                                           (if (string? body)
300
                                               body
301
                                               (utf8->string body))))))
302
                       ((or (equal? type "text/html")
303
                            (equal? type "application/xhtml+xml"))
304
                         (let* ((content (html->sxml
305
                                           (if (string? body)
306
                                               body
307
                                               (utf8->string body))))
308
                                (uri (string->uri base))
309
                                (source #f))
310
                           (set! base (html-document-base base content))
311
                           (when (uri-fragment uri)
312
                             (set! source (get-script-content-by-id
313
                                            content (uri-fragment uri))))
314
                           (when (and (equal? source #f) profile)
315
                             (set! source (get-script-content-by-profile
316
                                            content profile)))
317
                           (when (and (equal? source #f) (not extract-all-scripts?))
318
                             (set! source (get-first-script-content content)))
319
                           (if source
320
                               (catch #t
321
                                 (lambda ()
322
                                   (set! document (reverse-document
323
                                                    (json-string->scm source))))
324
                                 (lambda _
325
                                   (throw 'invalid-script-element source)))
326
                               (begin
327
                                 (unless extract-all-scripts?
328
                                   (throw 'loading-document-failed))
329
                                 (set! document '())
330
                                 (for-each
331
                                   (lambda (el)
332
                                     (catch #t
333
                                       (lambda ()
334
                                         (let ((script-content
335
                                                 (reverse-document
336
                                                   (json-string->scm
337
                                                     (get-script-content el)))))
338
                                           (set! document
339
                                             (append document
340
                                                     (if (json-array? script-content)
341
                                                         (array->list script-content)
342
                                                         (list script-content))))))
343
                                       (lambda _
344
                                         (throw 'invalid-script-element el))))
345
                                   (get-script-elements content))
346
                                 (set! document (list->array 1 document))))))
347
                       (else
348
                         (throw 'loading-remote-document-failed)))
349
                     (new-json-document
350
                       #:document-url base
351
                       #:document document
352
                       #:content-type type
353
                       #:context-url context-url)))))
354
           (else (throw 'not-found))))))
355
    (lambda (key . value)
356
      (cond
357
        ((equal? key 'multiple-context-link-headers)
358
         (apply throw 'multiple-context-link-headers value))
359
        ((equal? key 'invalid-script-element)
360
         (apply throw 'invalid-script-element value))
361
        (else (apply throw 'loading-remote-document-failed key value))))))
362