Introduction to the wikitaxa package

Scott Chamberlain

2017-05-05

wikitaxa - Taxonomy data from Wikipedia

The goal of wikitaxa is to allow search and taxonomic data retrieval across many Wikimedia sites, including Wikipedia, Wikicommons, and Wikispecies.

The package API has two parts, a low-level API and a high-level API:

Low level API

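The low-level API is meant for power users and gives you more control, but requires more knowledge. The low-level functions demonstrated below are wt_wiki_page(), wt_wiki_page_parse(), wt_wikicommons_parse(), and wt_wikispecies_parse().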

High level API

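The high-level API is meant to be easier and faster to use. The high-level functions demonstrated below are wt_wikipedia(), wt_wikicommons(), and wt_wikispecies().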

Search functions: wt_wikipedia_search(), wt_wikicommons_search(), and wt_wikispecies_search().

Installation

CRAN version

install.packages("wikitaxa")

Dev version

devtools::install_github("ropensci/wikitaxa")
library("wikitaxa")

Wikidata

wt_data("Poa annua")

Get a Wikidata ID

wt_data_id("Mimulus foliatus")
#> [1] "Q6495130"
#> attr(,"class")
#> [1] "wiki_id"

wikipedia

lower level

pg <- wt_wiki_page("https://en.wikipedia.org/wiki/Malus_domestica")
res <- wt_wiki_page_parse(pg)
res$iwlinks
#> [1] "https://en.wiktionary.org/wiki/apple"                                  
#> [2] "https://commons.wikimedia.org/wiki/Special:Search/Apple"               
#> [3] "https://en.wikiquote.org/wiki/Apples"                                  
#> [4] "https://en.wikisource.org/wiki/1911_Encyclop%C3%A6dia_Britannica/Apple"
#> [5] "https://en.wikibooks.org/wiki/Apples"                                  
#> [6] "https://species.wikimedia.org/wiki/Malus_domestica"                    
#> [7] "https://commons.wikimedia.org/wiki/Category:Apple_cultivars"

higher level

res <- wt_wikipedia("Malus domestica")
res$common_names
#> # A tibble: 3 × 2
#>         name language
#>        <chr>    <chr>
#> 1 apple tree       en
#> 2      apple       en
#> 3      Apple       en
res$classification
#> # A tibble: 9 × 2
#>       rank         name
#>      <chr>        <chr>
#> 1  kingdom      Plantae
#> 2 unranked  Angiosperms
#> 3 unranked     Eudicots
#> 4 unranked       Rosids
#> 5    order      Rosales
#> 6   family     Rosaceae
#> 7    genus        Malus
#> 8  species    M. pumila
#> 9 binomial Malus pumila

choose a wikipedia language

# French
wt_wikipedia(name = "Malus domestica", wiki = "fr")
# Slovak
wt_wikipedia(name = "Malus domestica", wiki = "sk")
# Vietnamese
wt_wikipedia(name = "Malus domestica", wiki = "vi")

search

wt_wikipedia_search(query = "Pinus")
#> $batchcomplete
#> [1] ""
#> 
#> $continue
#> $continue$sroffset
#> [1] 10
#> 
#> $continue$continue
#> [1] "-||"
#> 
#> 
#> $query
#> $query$searchinfo
#> $query$searchinfo$totalhits
#> [1] 2804
#> 
#> 
#> $query$search
#> # A tibble: 10 × 6
#>       ns                 title  size wordcount
#> *  <int>                 <chr> <int>     <int>
#> 1      0                  Pine 19915      2372
#> 2      0 List of Pinus species 13999       995
#> 3      0      Pinus luchuensis  2903       166
#> 4      0     Pinus wallichiana  4295       433
#> 5      0           Pinus nigra 11468      1352
#> 6      0          Pinus kesiya  5281       512
#> 7      0       Pinus devoniana  3801       397
#> 8      0  Pinus × sondereggeri  3485       347
#> 9      0            Pinus mugo 10884       795
#> 10     0     Pinus heldreichii  6482       707
#> # ... with 2 more variables: snippet <chr>, timestamp <chr>

search supports languages

wt_wikipedia_search(query = "Pinus", wiki = "fr")

wikicommons

lower level

pg <- wt_wiki_page("https://commons.wikimedia.org/wiki/Abelmoschus")
res <- wt_wikicommons_parse(pg)
res$common_names[1:3]
#> [[1]]
#> [[1]]$name
#> [1] "okra"
#> 
#> [[1]]$language
#> [1] "en"
#> 
#> 
#> [[2]]
#> [[2]]$name
#> [1] "مسكي"
#> 
#> [[2]]$language
#> [1] "ar"
#> 
#> 
#> [[3]]
#> [[3]]$name
#> [1] "Abelmoş"
#> 
#> [[3]]$language
#> [1] "az"

higher level

res <- wt_wikicommons("Abelmoschus")
res$classification
#> # A tibble: 15 × 2
#>            rank           name
#>           <chr>          <chr>
#> 1        Domain      Eukaryota
#> 2    • unranked Archaeplastida
#> 3      • Regnum        Plantae
#> 4      • Cladus    angiosperms
#> 5      • Cladus       eudicots
#> 6      • Cladus  core eudicots
#> 7      • Cladus    superrosids
#> 8      • Cladus         rosids
#> 9      • Cladus    eurosids II
#> 10       • Ordo       Malvales
#> 11    • Familia      Malvaceae
#> 12 • Subfamilia     Malvoideae
#> 13     • Tribus      Hibisceae
#> 14            •    Abelmoschus
#> 15               Medik. (1787)
res$common_names
#> # A tibble: 18 × 2
#>             name language
#>            <chr>    <chr>
#> 1           okra       en
#> 2           مسكي       ar
#> 3        Abelmoş       az
#> 4      Ibiškovec       cs
#> 5   Bisameibisch       de
#> 6          Okrat       fi
#> 7      Abelmosco       gl
#> 8      Abelmošus       hr
#> 9         Ybiškė       lt
#> 10  അബെൽമോസ്കസ്       ml
#> 11      Абельмош      mrj
#> 12       Piżmian       pl
#> 13      Абельмош       ru
#> 14          موري       sd
#> 15   Okrasläktet       sv
#> 16      Абельмош      udm
#> 17 Chi Vông vang       vi
#> 18        黄葵属       zh

search

wt_wikicommons_search(query = "Pinus")
#> $batchcomplete
#> [1] ""
#> 
#> $continue
#> $continue$sroffset
#> [1] 10
#> 
#> $continue$continue
#> [1] "-||"
#> 
#> 
#> $query
#> $query$searchinfo
#> $query$searchinfo$totalhits
#> [1] 257
#> 
#> 
#> $query$search
#> # A tibble: 10 × 6
#>       ns                                    title  size wordcount
#> *  <int>                                    <chr> <int>     <int>
#> 1      0                                    Pinus  4160       303
#> 2      0                              Pinus nigra  7449       486
#> 3      0                       Pinus × schwerinii   634        67
#> 4      0                               Pinus mugo  7157       573
#> 5      0                             Spinus pinus  1563       242
#> 6      0                       Pinus tabuliformis  1739       136
#> 7      0                          Setophaga pinus  1735       198
#> 8      0                          Pinus sabiniana  2799       217
#> 9      0 Pinus distribution maps of North America 25971        92
#> 10     0                            Pinus cooperi   564        64
#> # ... with 2 more variables: snippet <chr>, timestamp <chr>

wikispecies

lower level

pg <- wt_wiki_page("https://species.wikimedia.org/wiki/Malus_domestica")
res <- wt_wikispecies_parse(pg, types = "common_names")
res$common_names[1:3]
#> [[1]]
#> [[1]]$name
#> [1] "Ябълка"
#> 
#> [[1]]$language
#> [1] "български"
#> 
#> 
#> [[2]]
#> [[2]]$name
#> [1] "Poma, pomera"
#> 
#> [[2]]$language
#> [1] "català"
#> 
#> 
#> [[3]]
#> [[3]]$name
#> [1] "Apfel"
#> 
#> [[3]]$language
#> [1] "Deutsch"

higher level

res <- wt_wikispecies("Malus domestica")
res$classification
#> # A tibble: 8 × 2
#>          rank          name
#>         <chr>         <chr>
#> 1 Superregnum     Eukaryota
#> 2      Regnum       Plantae
#> 3      Cladus   Angiosperms
#> 4      Cladus      Eudicots
#> 5      Cladus Core eudicots
#> 6      Cladus        Rosids
#> 7      Cladus    Eurosids I
#> 8        Ordo       Rosales
res$common_names
#> # A tibble: 19 × 2
#>               name   language
#>              <chr>      <chr>
#> 1           Ябълка  български
#> 2     Poma, pomera     català
#> 3            Apfel    Deutsch
#> 4      Aed-õunapuu      eesti
#> 5            Μηλιά   Ελληνικά
#> 6            Apple    English
#> 7          Manzano    español
#> 8            Pomme   français
#> 9            Melâr     furlan
#> 10        사과나무     한국어
#> 11          ‘Āpala    Hawaiʻi
#> 12            Melo   italiano
#> 13           Aapel Nordfriisk
#> 14  Maçã, Macieira  português
#> 15 Яблоня домашняя    русский
#> 16   Tarhaomenapuu      suomi
#> 17            Elma     Türkçe
#> 18  Яблуня домашня українська
#> 19          Pomaro     vèneto

search

wt_wikispecies_search(query = "Pinus")
#> $batchcomplete
#> [1] ""
#> 
#> $continue
#> $continue$sroffset
#> [1] 10
#> 
#> $continue$continue
#> [1] "-||"
#> 
#> 
#> $query
#> $query$searchinfo
#> $query$searchinfo$totalhits
#> [1] 396
#> 
#> 
#> $query$search
#> # A tibble: 10 × 6
#>       ns                    title  size wordcount
#> *  <int>                    <chr> <int>     <int>
#> 1      0                    Pinus  1570       282
#> 2      0        Pinus subg. Pinus   318        27
#> 3      0             Pinus clausa  1183       211
#> 4      0        Pinus sect. Pinus   623        68
#> 5      0           Pinus resinosa  1195       166
#> 6      0 Pinus nigra subsp. nigra  1412       127
#> 7      0            Pinus cooperi   680        89
#> 8      0         Pinus thunbergii   873       122
#> 9      0         Pinus gordoniana   594        61
#> 10     0     Pinus subsect. Pinus   718        94
#> # ... with 2 more variables: snippet <chr>, timestamp <chr>