1 ;;; org-tag-eldoc-wikipedia.el --- Web Scraping on Wikipedia -*- lexical-binding: t; -*-
2 ;; -*- coding: utf-8 -*-
4 ;; Copyright (C) 2024-2025 Christopher M. Miles, all rights reserved.
8 ;; search API: https://wikipedia.org/w/index.php?search=[query]
9 ;; tag URL: https://en.wikipedia.org/wiki/Computer_science
14 (require 'url-http
) ; for `url-http-end-of-headers'
17 (require 'elquery nil t
) ; optionally use elquery.el to replace dom.el.
18 (require 'org-tag-eldoc-common
)
;; Fetch the English Wikipedia page for TAG and store an explanation string
;; in `org-tag-eldoc--explanation'.
;;
;; NOTE(review): the numbering embedded in this listing has gaps (26->31,
;; 34->36, 37->40, 42->44, 45->47, 49->51, 52->54), so several lines of this
;; function are not visible here.  Presumably they hold the `request' call,
;; its :parser/:success/:error keywords and the enclosing `cond' forms --
;; confirm against the upstream file before changing any code below.
21 (defun org-tag-eldoc-wikipedia--request (tag)
22 "Send HTTP GET request to get TAG explanation data."
;; Dynamic bindings that configure the HTTP request, plus the page URL.
23 (let ((request-backend 'url-retrieve
) ; use `url-retrieve' backend for proxy.
24 (request-message-level -
1)
;; Proxy settings come from `org-tag-eldoc-request-proxy', defined elsewhere.
25 (url-proxy-services org-tag-eldoc-request-proxy
)
;; NOTE(review): `capitalize' also downcases the rest of each word (e.g.
;; "GitHub" becomes "Github"), and TAG is inserted into the query string
;; without URL-encoding -- confirm both are intended.
26 (url (format "https://en.wikipedia.org/w/index.php?title=%s" (capitalize tag
))))
;; Response parsing: two alternatives are visible, tried in this order --
;; libxml when `libxml-parse-html-region' is fbound, otherwise elquery when
;; that feature is loaded.  Both read the whole response buffer.
31 ((fboundp 'libxml-parse-html-region
) ; convert HTML -> Elisp alist structure
32 (libxml-parse-html-region (point-min) (point-max)))
33 ((featurep 'elquery
) ; convert HTML -> elquery object structure
34 (elquery-read-buffer (current-buffer)))))
;; Handler receiving the parsed response as DATA (presumably the :success
;; callback; the keyword itself is on a line not visible here).
36 (lambda (&key data
&allow-other-keys
)
37 ;; DEBUG: (setq request-result data)
40 ;; for `libxml' parser
;; Search below the node(s) whose class matches
;; "mw-content-ltr mw-parser-output" for <p> nodes whose second element
;; (the attribute list in dom.el's node layout) is nil, then hand the result
;; to `org-tag-eldoc-common--format-explanation'.
;; NOTE(review): line 43 is missing between the two calls below; the
;; commented-out experiments in this file apply `dom-texts' at that point
;; -- confirm.
41 (setq org-tag-eldoc--explanation
42 (org-tag-eldoc-common--format-explanation
44 (dom-search (dom-by-class data
"mw-content-ltr\\ mw-parser-output")
45 (lambda (node) (and (eq (cl-first node
) 'p
) (null (cl-second node
)))))))))
47 ;; for `elquery' parser
;; Take the text of the first element matched by the CSS selector.
48 (setq org-tag-eldoc--explanation
49 (elquery-text (car (elquery-$
"#mw-content-text > div.mw-content-ltr.mw-parser-output > p" data
))))))))
;; Handler taking ERROR-THROWN (presumably the :error callback).  Its only
;; visible body line is a commented-out `message' call.
51 (lambda (&rest args
&key error-thrown
&allow-other-keys
)
52 ;; (message "[org-tag-eldoc] (Wikipedia) request error %s!" error-thrown)
;; HTTP 404 and 500 responses are mapped to handlers that just return nil.
54 :status-code
'((404 .
(lambda (&rest _
) nil
))
55 (500 .
(lambda (&rest _
) nil
))))
;; Commented-out synchronous variant based on `url-retrieve-synchronously'.
;; NOTE(review): the parens closing `let' and `defun' are not visible in this
;; listing; they are presumably on one of the missing lines near here.
57 ;; (with-current-buffer (url-retrieve-synchronously url)
59 ;; (goto-char url-http-end-of-headers)
60 ;; (libxml-parse-html-region (point) (point-max)))))
61 ;; ;; DEBUG: (setq request-result dom)
62 ;; (org-tag-eldoc-common--format-explanation
66 ;; (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
67 ;; (lambda (node) (and (eq (cl-first node) 'p) (null (cl-second node)))))))
68 ;; ((featurep 'elquery)
69 ;; (elquery-text (car (elquery-$ "#mw-content-text > div.mw-content-ltr.mw-parser-output > p" data))))))))
72 ;;; TEST: https://en.wikipedia.org/w/index.php?title=Computer_science
73 ;; (org-tag-eldoc-wikipedia--request "computer_science")
76 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
77 ;; (with-current-buffer
78 ;; (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=Computer_science")
80 ;; (goto-char url-http-end-of-headers)
81 ;; (libxml-parse-html-region (point) (point-max)))))
82 ;; (setq request-result dom)
83 ;; (setq org-tag-eldoc--explanation
85 ;; (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
86 ;; (lambda (node) (and (eq (cl-first node) 'p) (null (cl-second node))))))))))
90 ;; (dom-search (dom-by-class request-result "mw-content-ltr\\ mw-parser-output")
91 ;; (lambda (node) (and (eq (cl-first node) 'p) (null (cl-second node))))))
94 ;;; this example works fine.
95 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
96 ;; (with-current-buffer (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=GitHub")
97 ;; (let ((dom (libxml-parse-html-region (point-min) (point-max))))
98 ;; (let ((paragraph-node (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
99 ;; (lambda (node) (eq (car node) 'p)))))
100 ;; (setq node-github paragraph-node)
101 ;; (dom-texts paragraph-node)))))
103 ;; `node-github' ; looks fine
105 ;;; this example failed on `dom-texts'.
106 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
107 ;; (with-current-buffer (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=Computer_science")
108 ;; (let ((dom (libxml-parse-html-region (point-min) (point-max))))
109 ;; (let ((paragraph-node (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
110 ;; (lambda (node) (eq (car node) 'p)))))
111 ;; (setq node-computer-science paragraph-node)
112 ;; (dom-texts paragraph-node)))))
114 ;; `node-computer-science' ; looks fine
116 ;;; This example works by using a new `dom-search' rule.
117 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
118 ;; (with-current-buffer (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=Computer_science")
119 ;; (let ((dom (libxml-parse-html-region (point-min) (point-max))))
120 ;; (let ((paragraph-node (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
121 ;; (lambda (node) (and (eq (cl-first node) 'p)
122 ;; (null (cl-second node)))))))
123 ;; (setq node-computer-science paragraph-node)
124 ;; (dom-texts paragraph-node)))))
(defun org-tag-eldoc-wikipedia-query (tag)
  "Look up TAG on Wikipedia, reusing an explanation that is already stored.
When `org-tag-eldoc--explanation' holds a string, return it unchanged.
Otherwise call `org-tag-eldoc-wikipedia--request' with TAG, then pass TAG
and the current value of `org-tag-eldoc--explanation' to
`org-tag-eldoc-database-save' and return that call's value."
  (cond
   ;; An explanation string is already available: no request is made.
   ((stringp org-tag-eldoc--explanation)
    org-tag-eldoc--explanation)
   ;; Nothing usable stored yet: issue the request, then save whatever
   ;; `org-tag-eldoc--explanation' holds at that point.
   (t
    (org-tag-eldoc-wikipedia--request tag)
    (org-tag-eldoc-database-save tag org-tag-eldoc--explanation))))
136 (provide 'org-tag-eldoc-wikipedia
)
138 ;;; org-tag-eldoc-wikipedia.el ends here