Introduction to the wikitaxa package

Scott Chamberlain

2017-12-20

wikitaxa - Taxonomy data from Wikipedia

The goal of wikitaxa is to support searching for and retrieving taxonomic data from several Wikimedia sites, including Wikipedia, Wikimedia Commons (referred to as Wikicommons in this package), Wikispecies, and Wikidata.

The package API has two parts, a low-level API and a high-level API:

Low level API

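The low-level API is meant for power users: it gives you more control, but requires more knowledge. It includes functions such as wt_wiki_page(), wt_wiki_page_parse(), wt_wikicommons_parse(), and wt_wikispecies_parse().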

High level API

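The high-level API is meant to be easier and faster to use. It includes functions such as wt_data(), wt_data_id(), wt_wikipedia(), wt_wikicommons(), and wt_wikispecies().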

Search functions: wt_wikipedia_search(), wt_wikicommons_search(), and wt_wikispecies_search().

Installation

CRAN version

install.packages("wikitaxa")

Dev version

devtools::install_github("ropensci/wikitaxa")
library("wikitaxa")

Wikidata

wt_data("Poa annua")

Get a Wikidata ID

wt_data_id("Mimulus foliatus")
#> [1] "Q6495130"
#> attr(,"class")
#> [1] "wiki_id"

wikipedia

lower level

pg <- wt_wiki_page("https://en.wikipedia.org/wiki/Malus_domestica")
res <- wt_wiki_page_parse(pg)
res$iwlinks
#> [1] "https://en.wiktionary.org/wiki/apple"                                  
#> [2] "https://commons.wikimedia.org/wiki/Special:Search/Apple"               
#> [3] "https://en.wikiquote.org/wiki/Apples"                                  
#> [4] "https://en.wikisource.org/wiki/1911_Encyclop%C3%A6dia_Britannica/Apple"
#> [5] "https://en.wikibooks.org/wiki/Apples"                                  
#> [6] "https://species.wikimedia.org/wiki/Malus_domestica"                    
#> [7] "https://commons.wikimedia.org/wiki/Category:Apple_cultivars"

higher level

res <- wt_wikipedia("Malus domestica")
res$common_names
#> # A tibble: 1 x 2
#>    name language
#>   <chr>    <chr>
#> 1 Apple       en
res$classification
#> # A tibble: 3 x 2
#>         rank         name
#>        <chr>        <chr>
#> 1 plainlinks             
#> 2    species    M. pumila
#> 3   binomial Malus pumila

choose a wikipedia language

# French
wt_wikipedia(name = "Malus domestica", wiki = "fr")
# Slovak
wt_wikipedia(name = "Malus domestica", wiki = "sk")
# Vietnamese
wt_wikipedia(name = "Malus domestica", wiki = "vi")

search

wt_wikipedia_search(query = "Pinus")
#> $batchcomplete
#> [1] ""
#> 
#> $continue
#> $continue$sroffset
#> [1] 10
#> 
#> $continue$continue
#> [1] "-||"
#> 
#> 
#> $query
#> $query$searchinfo
#> $query$searchinfo$totalhits
#> [1] 2912
#> 
#> 
#> $query$search
#> # A tibble: 10 x 7
#>       ns                 title  pageid  size wordcount
#>  * <int>                 <chr>   <int> <int>     <int>
#>  1     0                  Pine   39389 21808      2460
#>  2     0 List of Pinus species  448990 14070       984
#>  3     0        Pinus longaeva  649634 12794      1424
#>  4     0       Pinus ponderosa  532941 29851      2644
#>  5     0            Pinus mugo  438946 10733       808
#>  6     0      Bristlecone pine  215931 16321      1679
#>  7     0           Pinus nigra  438963 11476      1352
#>  8     0      Pinus thunbergii 1522846  4679       438
#>  9     0        Pinus contorta  507717 22621      2321
#> 10     0       Pinus sabiniana  427209 13352      1262
#> # ... with 2 more variables: snippet <chr>, timestamp <chr>

search supports languages

wt_wikipedia_search(query = "Pinus", wiki = "fr")

wikicommons

lower level

pg <- wt_wiki_page("https://commons.wikimedia.org/wiki/Abelmoschus")
res <- wt_wikicommons_parse(pg)
res$common_names[1:3]
#> [[1]]
#> [[1]]$name
#> [1] "okra"
#> 
#> [[1]]$language
#> [1] "en"
#> 
#> 
#> [[2]]
#> [[2]]$name
#> [1] "مسكي"
#> 
#> [[2]]$language
#> [1] "ar"
#> 
#> 
#> [[3]]
#> [[3]]$name
#> [1] "Abelmoş"
#> 
#> [[3]]$language
#> [1] "az"

higher level

res <- wt_wikicommons("Abelmoschus")
res$classification
#> # A tibble: 15 x 2
#>          rank           name
#>         <chr>          <chr>
#>  1     Domain      Eukaryota
#>  2   unranked Archaeplastida
#>  3     Regnum        Plantae
#>  4     Cladus    angiosperms
#>  5     Cladus       eudicots
#>  6     Cladus  core eudicots
#>  7     Cladus    superrosids
#>  8     Cladus         rosids
#>  9     Cladus    eurosids II
#> 10       Ordo       Malvales
#> 11    Familia      Malvaceae
#> 12 Subfamilia     Malvoideae
#> 13     Tribus      Hibisceae
#> 14      Genus    Abelmoschus
#> 15  Authority  Medik. (1787)
res$common_names
#> # A tibble: 19 x 2
#>                name language
#>               <chr>    <chr>
#>  1             okra       en
#>  2             مسكي       ar
#>  3          Abelmoş       az
#>  4        Ibiškovec       cs
#>  5     Bisameibisch       de
#>  6            Okrat       fi
#>  7        Abelmosco       gl
#>  8        Abelmošus       hr
#>  9           Ybiškė       lt
#> 10       അബെൽമോസ്കസ്       ml
#> 11         Абельмош      mrj
#> 12 Abelmoskusslekta       nn
#> 13          Piżmian       pl
#> 14         Абельмош       ru
#> 15             موري       sd
#> 16      Okrasläktet       sv
#> 17         Абельмош      udm
#> 18    Chi Vông vang       vi
#> 19           黄葵属       zh

search

wt_wikicommons_search(query = "Pinus")
#> $batchcomplete
#> [1] ""
#> 
#> $continue
#> $continue$sroffset
#> [1] 10
#> 
#> $continue$continue
#> [1] "-||"
#> 
#> 
#> $query
#> $query$searchinfo
#> $query$searchinfo$totalhits
#> [1] 261
#> 
#> 
#> $query$search
#> # A tibble: 10 x 7
#>       ns                                    title   pageid  size wordcount
#>  * <int>                                    <chr>    <int> <int>     <int>
#>  1     0                                    Pinus    82071  4154       320
#>  2     0                       Pinus × schwerinii 11923249   634        67
#>  3     0                              Pinus nigra    64703  7775       501
#>  4     0                             Spinus pinus   703299  1560       242
#>  5     0                            Pinus cooperi  8853401   564        64
#>  6     0 Pinus distribution maps of North America 29464212 25971        92
#>  7     0                           Pinus herrerae 29975479   206        28
#>  8     0                       Pinus tabuliformis   235899  1739       138
#>  9     0                          Pinus maximinoi 20376092   485        60
#> 10     0                      Pinus pseudostrobus  9972866   756        83
#> # ... with 2 more variables: snippet <chr>, timestamp <chr>

wikispecies

lower level

pg <- wt_wiki_page("https://species.wikimedia.org/wiki/Malus_domestica")
res <- wt_wikispecies_parse(pg, types = "common_names")
res$common_names[1:3]
#> [[1]]
#> [[1]]$name
#> [1] "Ябълка"
#> 
#> [[1]]$language
#> [1] "български"
#> 
#> 
#> [[2]]
#> [[2]]$name
#> [1] "Poma, pomera"
#> 
#> [[2]]$language
#> [1] "català"
#> 
#> 
#> [[3]]
#> [[3]]$name
#> [1] "Apfel"
#> 
#> [[3]]$language
#> [1] "Deutsch"

higher level

res <- wt_wikispecies("Malus domestica")
res$classification
#> # A tibble: 8 x 2
#>          rank          name
#>         <chr>         <chr>
#> 1 Superregnum     Eukaryota
#> 2      Regnum       Plantae
#> 3      Cladus   Angiosperms
#> 4      Cladus      Eudicots
#> 5      Cladus Core eudicots
#> 6      Cladus        Rosids
#> 7      Cladus    Eurosids I
#> 8        Ordo       Rosales
res$common_names
#> # A tibble: 19 x 2
#>               name   language
#>              <chr>      <chr>
#>  1          Ябълка  български
#>  2    Poma, pomera     català
#>  3           Apfel    Deutsch
#>  4     Aed-õunapuu      eesti
#>  5           Μηλιά   Ελληνικά
#>  6           Apple    English
#>  7         Manzano    español
#>  8           Pomme   français
#>  9           Melâr     furlan
#> 10        사과나무     한국어
#> 11          ‘Āpala    Hawaiʻi
#> 12            Melo   italiano
#> 13           Aapel Nordfriisk
#> 14  Maçã, Macieira  português
#> 15 Яблоня домашняя    русский
#> 16   Tarhaomenapuu      suomi
#> 17            Elma     Türkçe
#> 18  Яблуня домашня українська
#> 19          Pomaro     vèneto

search

wt_wikispecies_search(query = "Pinus")
#> $batchcomplete
#> [1] ""
#> 
#> $continue
#> $continue$sroffset
#> [1] 10
#> 
#> $continue$continue
#> [1] "-||"
#> 
#> 
#> $query
#> $query$searchinfo
#> $query$searchinfo$totalhits
#> [1] 400
#> 
#> 
#> $query$search
#> # A tibble: 10 x 7
#>       ns                    title pageid  size wordcount
#>  * <int>                    <chr>  <int> <int>     <int>
#>  1     0                    Pinus  17362  1570       282
#>  2     0 Pinus nigra subsp. nigra 327138  1412       127
#>  3     0        Pinus subg. Pinus 300923   318        27
#>  4     0             Pinus clausa  45047  1520       210
#>  5     0        Pinus sect. Pinus 300935   623        68
#>  6     0           Pinus resinosa  45082  1195       165
#>  7     0         Pinus gordoniana 260795   594        61
#>  8     0     Pinus subsect. Pinus 300938   718        94
#>  9     0         Pinus thunbergii  73542   999       140
#> 10     0          Pinus sabiniana  45084   644        80
#> # ... with 2 more variables: snippet <chr>, timestamp <chr>