clj-xpath - Simplified XPath for Clojure

Overview

clj-xpath wraps Java’s XPath APIs to make them significantly easier to use for common use cases.

API Documentation

Project Dependencies

For Clojure, add the following to your project.clj file:

 [com.github.kyleburton/clj-xpath "1.4.13"]

For maven, add the following dependency to your pom.xml:

<dependency>
  <groupId>com.github.kyleburton</groupId>
  <artifactId>clj-xpath</artifactId>
  <version>1.4.13</version>
</dependency>

Example

;; This is an example of using clj-xpath.  We'll grab an RSS
;; feed (an XML document) and we'll use clj-xpath to explore the
;; structure of the document as well as extract its contents.

(ns clj-xpath-examples.core
  (:require
   [clojure.string :as string]
   [clojure.pprint :as pp])
  (:use
   clj-xpath.core))

;; Let's define the URL of an RSS feed.  The HTTPS form is used on
;; purpose: `slurp` opens the URL through the JVM's URL connection,
;; which does not follow a redirect that switches from http to https,
;; so a site that redirects plain-HTTP requests would hand back the
;; redirect response instead of the feed.
(def hackernews-rss-url "https://news.ycombinator.com/rss")

;; Zero-argument function returning the raw RSS text found at
;; hackernews-rss-url.  It is wrapped in memoize so that repeated
;; calls reuse the first result instead of downloading the feed again
;; (hitting Hacker News multiple times would be rude).
(def hackernews-rss-xml
  (memoize
   #(slurp hackernews-rss-url)))

(comment
  ;; We can see that the XML loaded by asking for the character count:
  (count (hackernews-rss-xml))
  ;;  => 9968
  )

;; Zero-argument function returning the feed parsed into a Document
;; via xml->doc.  Memoizing is not strictly necessary, but it caches
;; the parsed document so later operations do not have to re-parse
;; the XML text.
(def xmldoc
  (memoize
   #(-> (hackernews-rss-xml)
        xml->doc)))

(comment
  ;; With a document, we can now perform some XPath operations on the document:

  ;; extract the root node:
  ;; NB: I'm removing :text (the dissoc) because it's large and not very useful at this point...
  (->
   ($x "/*" (xmldoc))
   first
   (dissoc :text))
  ;; => {:node #<DeferredElementImpl [rss: null]>, :tag :rss, :attrs {:version "2.0"}}

  ;; extract the root node's tag:
  ($x:tag "/*" (xmldoc))
  ;; => :rss

  ;; extract the channel's title, link, and description:
  ($x:text "/rss/channel/title" (xmldoc))
  ;; => "Hacker News"

  ($x:text "/rss/channel/link" (xmldoc))
  ;; => "http://news.ycombinator.com/"

  ($x:text "/rss/channel/description" (xmldoc))
  ;; => "Links for the intellectually curious, ranked by readers."

  ;; If you're not intimately familiar with the structure of your XML,
  ;; seeing what it looks like can be a pain, so we'll define a few
  ;; more helper functions...
  )

;; Walks the whole document and returns the distinct tag names it
;; contains, as strings (in no particular order).
(defn all-tags [doc]
  (let [;; the walk starts at the first node matched by "./*" on doc
        root     (first ($x "./*" doc))
        ;; child elements of a node, looked up with a relative XPath
        children (fn [n] ($x "./*" n))
        ;; tree-seq flattens the document into a one-dimensional
        ;; stream of nodes; anything carrying a :node is a branch
        nodes    (tree-seq :node children root)]
    (->> nodes
         (map :tag)
         ;; collapse the stream of tags into a set to drop duplicates
         (reduce conj #{})
         seq
         ;; turn the keywords into strings
         (map name))))

(comment

 ;; Let's use that to see what all the tags are:
 (all-tags (xmldoc))
 ;; ("link" "item" "title" "channel" "rss" "comments" "description")

 ;; That helps a bit, but it's still not too useful, as it doesn't
 ;; show us how they nest.  Let's try another helper function...
 )

;; Better would be to visit each node in the document, keeping track
;; of the path.  Nodes are visited depth-first: the callback runs on a
;; node before any of its children.  The callback f is passed 2
;; arguments:
;;  * path: the path to the node being visited (the incoming path
;;          with the node's own :tag conj'ed on)
;;  * node: the node being visited
;; Returns a vector of the callback's return values, in visit order.
(defn visit-nodes [path nodes f]
  (reduce
   (fn [results node]
     (let [node-path (conj path (:tag node))
           ;; invoke the callback on this node first...
           visited   (f node-path node)
           ;; ...then visit each of the children of this node
           below     (visit-nodes node-path ($x "./*" node) f)]
       (into (conj results visited) below)))
   []
   nodes))

(comment
  ;; Visit each part of the document (tree) and print out what the path to each tag looks like:
  (visit-nodes []
               ($x "./*" (xmldoc))
               (fn [p n]
                 (printf "%s tag:%s\n"
                         (apply str (interpose "/" (map name p)))
                         (name (:tag n)))))
  ;; Here's the output from the above:
  ;;     rss tag:rss
  ;;     rss/channel tag:channel
  ;;     rss/channel/title tag:title
  ;;     rss/channel/link tag:link
  ;;     rss/channel/description tag:description
  ;;     rss/channel/item tag:item

  )

;; This is pretty much what we want, let's wrap it up into a
;; function.  Returns every distinct element path in doc as a
;; "/"-separated string such as "/rss/channel/title", ordered by the
;; first time each path is visited.
(defn all-paths [doc]
  (->> (visit-nodes []
                    ($x "./*" doc)
                    ;; the callback just hands back the path it was given
                    (fn [path _node] path))
       ;; repeated elements (e.g. many <item>s) yield the same path
       ;; over and over; `distinct` keeps only the first occurrence
       ;; of each while preserving order
       distinct
       (map #(str "/" (string/join "/" (map name %))))))

(comment
  ;; Let's try it out:
  (all-paths (xmldoc))
  ;; => ("/rss" "/rss/channel" "/rss/channel/title" "/rss/channel/link" "/rss/channel/description" "/rss/channel/item" "/rss/channel/item/title" "/rss/channel/item/link" "/rss/channel/item/comments" "/rss/channel/item/description")

  ;; Format it a bit nicer:
  (doseq [p (all-paths (xmldoc))]
    (println p))

  ;; There, a nice list of all the paths in the document:
  ;;   /rss
  ;;   /rss/channel
  ;;   /rss/channel/title
  ;;   /rss/channel/link
  ;;   /rss/channel/description
  ;;   /rss/channel/item
  ;;   /rss/channel/item/title
  ;;   /rss/channel/item/link
  ;;   /rss/channel/item/comments
  ;;   /rss/channel/item/description


  ;; Now that we can see a bit of the lay of the land, let's grab the first few items' title and link:
  (pp/pprint
   (map
    (fn [item]
      {:title ($x:text "./title" item)
       :link  ($x:text "./link" item)})
    (take 5
          ($x "/rss/channel/item" (xmldoc)))))

  ;; ({:title "Thank HN: Our friend is Safe and Sound"
  ;;   :link
  ;;   "http://jacquesmattheij.com/thank-hn-our-friend-is-safe-and-sound"}
  ;;  {:title "Entrepreneurshit"
  ;;   :link
  ;;   "http://www.bothsidesofthetable.com/2012/11/18/entrepreneurshit-the-blog-post-on-what-its-really-like/?awesm=bothsid.es_i2G&utm_source=t.co&utm_content=awesm-publisher&utm_medium=bothsid.es-twitter&utm_campaign="}
  ;;  {:title "The British Ruby Conference has been cancelled"
  ;;   :link "http://2013.britruby.com"}
  ;;  {:title "Man Arrested At Airport for Unusual Watch"
  ;;   :link
  ;;   "http://depletedcranium.com/man-arrested-at-airport-for-unusual-watch/"}
  ;;  {:title
  ;;   "Textadept: fast, minimalist, and Lua-extensible cross-platform text editor"
  ;;   :link "http://foicica.com/textadept"})



  ;; That concludes the overview of clj-xpath's main features.
  )