Wookieepedia Item Scraper
ActivePublic

Authored by kqr on Jan 17 2019, 10:41 AM.
open System.IO
open System.Text.RegularExpressions
open FSharp.Data
open Newtonsoft.Json
/// Root address of the wiki; request paths are resolved against it before downloading.
let baseUrl = System.Uri "http://starwars.wikia.com"

/// Site-relative path of the category listing that the crawl starts from.
let canonArticles = "/wiki/Category:Canon_articles"
/// Waits 250 ms, then downloads and parses the HTML document found at `path`
/// (resolved against `baseUrl`). Because the pause precedes every request,
/// consecutive calls are spaced at least a quarter of a second apart.
let downloadSlowly (path : string) =
    // Block the calling thread for the fixed delay before touching the network.
    Async.Sleep 250 |> Async.RunSynchronously
    let target = System.Uri(baseUrl, path)
    HtmlDocument.Load(target.ToString())
/// Turns a pair of options into an optional pair: the result is `Some (a, b)`
/// only when both components are present, and `None` otherwise.
let optionPair (pair : option<'T> * option<'U>) =
    let first, second = pair
    first |> Option.bind (fun a -> second |> Option.map (fun b -> (a, b)))
/// A single scraped value: either one piece of text or a sequence of text items.
type InformationEntry =
    /// A value made up of a single string.
    | PlainText of string
    /// A value made up of several strings.
    | EntryList of string seq
/// JSON converter for `InformationEntry` that supports writing only.
/// `PlainText` is emitted through `JsonWriter.WriteValue`; `EntryList` is handed
/// back to the serializer so its items are serialized by the normal machinery.
type InformationEntryJsonConverter() =
    inherit JsonConverter<InformationEntry>()

    /// Reading is not implemented: every call raises with the message "stub".
    override __.ReadJson(_reader, _objectType, _existingValue, _hasExistingValue, _serializer) =
        failwith "stub"

    /// Writes `info` to `writer` without any wrapper around the union case.
    override __.WriteJson(writer : JsonWriter, info : InformationEntry, serializer : JsonSerializer) =
        match info with
        | EntryList items -> serializer.Serialize(writer, items)
        | PlainText text -> writer.WriteValue(text)
/// Returns the inner text of `node` with every bracketed run of digits
/// (for example "[12]") removed.
let plainText (node : HtmlNode) =
    let rawText = node.InnerText()
    Regex.Replace(rawText, "\[[0-9]+\]", "")
/// Converts `node` into an `InformationEntry`.
/// When `node.Elements("ul")` yields anything, the cleaned text of every `li`
/// descendant becomes an `EntryList`; otherwise the node's own cleaned text
/// becomes a `PlainText`.
let plainTextOrList (node : HtmlNode) =
    let containsList = not (node.Elements("ul").IsEmpty)
    if containsList then
        EntryList (node.Descendants("li") |> Seq.map plainText)
    else
        PlainText (plainText node)
/// Extracts a category name from a `class` attribute.
/// The attribute value is split on spaces and the first piece matching
/// `pi-theme-<name>` supplies `<name>`; `None` when no piece matches.
let tryGetCategory (htmlClasses : HtmlAttribute) =
    // Returns the text captured after "pi-theme-" for one class name, if any.
    let themeOfClass (htmlClass : string) =
        match Regex.Match(htmlClass, "pi-theme-([^ ]+)") with
        | hit when hit.Success -> Some hit.Groups.[1].Value
        | _ -> None
    let classNames = htmlClasses.Value().Split(' ')
    Seq.tryPick themeOfClass classNames
/// Builds the `("__category__", PlainText theme)` pair for a page.
/// The theme comes from the first `class` attribute found among `aside`
/// elements carrying the `portable-infobox` class; `None` when there is no
/// such attribute or it holds no `pi-theme-*` class.
let category (page : HtmlDocument) =
    let infoboxes =
        page.Descendants("aside")
        |> Seq.filter (HtmlNode.hasClass "portable-infobox")
    let classAttribute =
        infoboxes |> Seq.tryPick (HtmlNode.tryGetAttribute "class")
    match classAttribute |> Option.bind tryGetCategory with
    | Some theme -> Some ("__category__", PlainText theme)
    | None -> None
/// Builds the `("__name__", PlainText title)` pair for a page.
/// The title is the cleaned text of the first descendant that has both
/// `data-source="name"` and the `pi-title` class; `None` when there is none.
let name (page : HtmlDocument) =
    let isTitleNode node =
        HtmlNode.hasAttribute "data-source" "name" node
        && HtmlNode.hasClass "pi-title" node
    page.Descendants()
    |> Seq.tryFind isTitleNode
    |> Option.map (fun titleNode -> ("__name__", PlainText (plainText titleNode)))
/// Builds the `("__image__", PlainText src)` pair for a page.
/// Only the first descendant with the `pi-image-thumbnail` class is considered;
/// the result is `None` when no such node exists or that node has no `src`.
let image (page : HtmlDocument) =
    let thumbnail =
        page.Descendants()
        |> Seq.tryFind (HtmlNode.hasClass "pi-image-thumbnail")
    match thumbnail |> Option.bind (HtmlNode.tryGetAttribute "src") with
    | Some src -> Some ("__image__", PlainText (HtmlAttribute.value src))
    | None -> None
/// Reads one label/value pair out of `div`.
/// The label is the cleaned text of the first `pi-data-label` descendant and the
/// value is the first `pi-data-value` descendant converted by `plainTextOrList`.
/// Returns `None` unless both are found.
let information (div : HtmlNode) =
    // First descendant with the given class, passed through `convert`.
    let firstWithClass cssClass convert =
        div.Descendants()
        |> Seq.tryFind (HtmlNode.hasClass cssClass)
        |> Option.map convert
    let label = firstWithClass "pi-data-label" plainText
    let value = firstWithClass "pi-data-value" plainTextOrList
    optionPair (label, value)
/// Builds the `("__description__", PlainText text)` pair for a page.
/// The text is the cleaned content of the first `p` element that
/// `HtmlNode.elementsNamed` returns for any descendant carrying the
/// `mw-content-text` class; `None` when there is no such paragraph.
let description (page : HtmlDocument) =
    let paragraphs =
        page.Descendants()
        |> Seq.filter (HtmlNode.hasClass "mw-content-text")
        |> Seq.collect (HtmlNode.elementsNamed [| "p" |])
    paragraphs
    |> Seq.tryHead
    |> Option.map (fun paragraph -> ("__description__", PlainText (plainText paragraph)))
/// Produces every key/value pair scraped from `page`, in this order:
/// category, name, image, one pair per `pi-item` descendant that yields a
/// label and a value, and finally the description. Pairs that cannot be
/// extracted are simply left out.
let allDataPairs (page : HtmlDocument) =
    seq {
        yield! category page |> Option.toList
        yield! name page |> Option.toList
        yield! image page |> Option.toList
        for item in page.Descendants() do
            if HtmlNode.hasClass "pi-item" item then
                match information item with
                | Some entry -> yield entry
                | None -> ()
        yield! description page |> Option.toList
    }
/// Lists the `href` of every member link on the category listing at
/// `listingUrl`, then continues with the listing that the first
/// `category-page__pagination-next` node links to, if it has an `href`.
/// The first listing is downloaded as soon as the function is called; later
/// listings are downloaded only when enumeration reaches them.
let rec findAll (listingUrl : string) =
    let page = downloadSlowly listingUrl
    // Value of a node's href attribute, when it has one.
    let hrefOf node =
        HtmlNode.tryGetAttribute "href" node |> Option.map HtmlAttribute.value
    seq {
        for node in page.Descendants() do
            if HtmlNode.hasClass "category-page__member-link" node then
                match hrefOf node with
                | Some href -> yield href
                | None -> ()
        let nextListing =
            page.Descendants()
            |> Seq.tryFind (HtmlNode.hasClass "category-page__pagination-next")
            |> Option.bind hrefOf
        match nextListing with
        | Some nextUrl -> yield! findAll nextUrl
        | None -> ()
    }
/// Serializes `value` as JSON onto `stream`, using
/// `InformationEntryJsonConverter` for the entries inside each map.
/// The JSON writer wrapped around `stream` is disposed when the function returns.
let jsonStream (stream : StreamWriter) (value : seq<Map<string, InformationEntry>>) =
    let serializer = JsonSerializer()
    let entryConverter = InformationEntryJsonConverter()
    serializer.Converters.Add(entryConverter)
    use jsonWriter = new JsonTextWriter(stream) :> JsonWriter
    serializer.Serialize(jsonWriter, value)
/// Program entry point. Routes console output through an auto-flushing writer
/// on standard output, walks every article linked from the `canonArticles`
/// listing, scrapes each into a map of its data pairs, and writes the whole
/// sequence as JSON to standard output. Always returns exit code 0; `argv` is
/// not used.
[<EntryPoint>]
let main argv =
    use output = new StreamWriter(System.Console.OpenStandardOutput())
    output.AutoFlush <- true
    System.Console.SetOut(output)
    // Download one article and collapse its pairs into a map keyed by label.
    let scrapeArticle articlePath =
        articlePath |> downloadSlowly |> allDataPairs |> Map.ofSeq
    let articles = findAll canonArticles |> Seq.map scrapeArticle
    jsonStream output articles
    0
kqr created this paste. Jan 17 2019, 10:41 AM
kqr created this object with edit policy "No One".
kqr edited the content of this paste. (Show Details) Jan 18 2019, 8:53 AM