Scrape All Pokemon Data

Now that we know how to build the initial Pokedex table and then fill out stats for each Pokemon in the initial table, we can build out the full Pokedex!

Load Libraries

Again, we’ll primarily use the rvest package, along with some other libraries for common functions to help get the Pokedex table.

# Load packages
library(rvest)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)

Read Initial Pokedex

Just like before, let’s read a table consisting of only Pokemon names, numbers, and the URLs to access each Pokemon’s page.

# Scrape the national Pokedex listing into a table with one row per info card.
#
# Returns:
#   A tibble with columns Number (numeric), Name (character), and URLs
#   (character, absolute link to the Pokemon's page).
fetch_pokedex_data <- function() {

  # Download the listing page and keep its <body>
  page_body <- html_nodes(
    read_html("https://pokemondb.net/pokedex/national"),
    "body"
  )

  # Each Pokemon is described by one info card
  cards <- html_nodes(page_body, "span.infocard-lg-data.text-muted")

  # The card's link carries both the display name and the relative URL
  links <- html_element(cards, "a")

  # The number text has a one-character prefix that is dropped before parsing
  number_text <- html_text(html_element(cards, "small"))

  # Assemble the Pokedex with its final column names
  tibble(
    Number = as.numeric(substring(number_text, 2)),
    Name = html_text(links),
    URLs = paste0("https://pokemondb.net", html_attr(links, "href"))
  )

}

# Build the initial Pokedex table (number, name, and page URL per Pokemon)
pokedex_tbl <- fetch_pokedex_data()

Fetch Pokemon Data

Loop through each Pokedex row and append additional information, such as height, battle stats, and evolution data. This will take some time, since we’re scraping data for every Pokemon and waiting one second between function calls to avoid ending up on the naughty list.

# Scrape one Pokemon's page and return its details as a one-row table.
#
# Args:
#   row: Vector describing one Pokedex entry, read by position:
#        row[1] is the number, row[2] the name, row[3] the page URL.
#
# Returns:
#   A one-row data frame holding `Type 1`, `Type 2`, `Type 3`, Species,
#   Height, Weight, the battle stats (as numbers), and the columns
#   `Has Evolution`, `Evolution Place`, `Maximum Evolution Count`, and
#   `Evolution Index`.
#
# Side effects:
#   Prints the name and number, performs one HTTP request, and sleeps for one
#   second before returning.
fetch_pokemon_data <- function(row) {

  # Fetch the name and URL from the Pokedex data
  name <- row[2]
  url <- row[3]
  print(paste0(name, " ", row[1]))

  # Read the body from the page
  body <- url %>% read_html() %>% html_nodes("body")

  # Get the tables with the vital information
  vitals_tables <- html_nodes(body, "table.vitals-table")
  if (length(vitals_tables) < 4) {
    stop(
      "Expected at least 4 vitals tables for ", name,
      ", found ", length(vitals_tables),
      call. = FALSE
    )
  }
  main_table <- html_table(vitals_tables[1])[[1]]
  stats_table <- html_table(vitals_tables[4])[[1]]

  # Turn a column of labels and a column of values into a one-row tibble.
  # This replaces transposing with t(): on a table that mixes text and
  # integers, t() formats the integers into width-padded strings (" 45").
  pairs_to_row <- function(labels, values) {
    cells <- as.list(values)
    names(cells) <- as.character(labels)
    as_tibble(cells, .name_repair = "minimal")
  }

  # Get the types, species, height, and weight from the table
  main_tbl <- pairs_to_row(main_table[[1]], main_table[[2]]) %>%
    select(c("Type", "Species", "Height", "Weight"))

  # Get stats columns, kept numeric so every Pokemon yields the same types
  stat_pairs <- select(stats_table, c("X1", "X2"))
  stats_tbl <- pairs_to_row(stat_pairs[[1]], as.numeric(stat_pairs[[2]]))

  # Merge the data
  data_tbl <- cbind(main_tbl, stats_tbl)

  # Clean up the types by breaking them into different columns
  num_types <- data_tbl$Type %>% strsplit(" ") %>% unlist %>% length
  col_names <- paste("Type", seq_len(num_types))
  data_tbl <- data_tbl %>%
    separate(col = "Type", into = col_names)

  # Ensure columns are fixed. Types sometimes only has 1, but can be up to 3.
  cols <- c(
    `Type 1` = NA_character_,
    `Type 2` = NA_character_,
    `Type 3` = NA_character_
  )
  data_tbl <- add_column(data_tbl,
                         !!!cols[setdiff(names(cols), names(data_tbl))])

  # Look for evolution information
  evo_node <- html_nodes(body, "div.infocard-list-evo")

  # Check to see if there was any evolution information
  has_evo <- length(evo_node) >= 1

  # If there was evolution information
  if (has_evo) {

    # Get the list of evolutions
    evo_names <- evo_node %>% html_nodes("a.ent-name") %>% html_text

    # Get the maximum number of evolutions for this Pokemon's evolution chain
    max_evo <- length(unique(evo_names))

    # Find out where in the evolution chain this Pokemon sits (NA if the name
    # does not appear in the list)
    evo_place <- which(tolower(evo_names) == tolower(name))[1]

    # Calculate an evolution index, how far to max evolution the Pokemon is
    evo_index <- round(as.double(evo_place) / as.double(max_evo), 2)

    # Otherwise, assume there is not evolution of this Pokemon
  } else {

    # Set the evolution information to NA
    max_evo <- NA_integer_
    evo_place <- NA_integer_
    evo_index <- NA_real_

  }

  # Append evolution information to the data tibble. Every column is stored
  # as a number (the flag as 1/0) so rows with and without evolution data
  # share the same column types.
  evo_tbl <- tibble(
    `Has Evolution` = as.numeric(has_evo),
    `Evolution Place` = as.numeric(evo_place),
    `Maximum Evolution Count` = as.numeric(max_evo),
    `Evolution Index` = as.numeric(evo_index)
  )
  data_tbl <- cbind(data_tbl, evo_tbl)

  # Add a sleep timer to not overload the system
  Sys.sleep(1)

  return(data_tbl)

}

# Scrape each Pokemon's page (one call per Pokedex row) and stack the
# one-row results into a single table
pokemon_tbl <- bind_rows(
  apply(pokedex_tbl, MARGIN = 1, FUN = fetch_pokemon_data)
)

# Put the Pokedex columns in front of the scraped columns
pokemon_tbl <- cbind(pokedex_tbl, pokemon_tbl)
Number Name URLs Type 1 Type 2 Species Height Weight HP Attack Defense Sp. Atk Sp. Def Speed Total Type 3 Has Evolution Evolution Place Maximum Evolution Count Evolution Index
1 Bulbasaur https://pokemondb.net/pokedex/bulbasaur Grass Poison Seed Pokémon 0.7 m (2′04″) 6.9 kg (15.2 lbs) 45 49 49 65 65 45 318 NA 1 1 3 0.33
2 Ivysaur https://pokemondb.net/pokedex/ivysaur Grass Poison Seed Pokémon 1.0 m (3′03″) 13.0 kg (28.7 lbs) 60 62 63 80 80 60 405 NA 1 2 3 0.67
3 Venusaur https://pokemondb.net/pokedex/venusaur Grass Poison Seed Pokémon 2.0 m (6′07″) 100.0 kg (220.5 lbs) 80 82 83 100 100 80 525 NA 1 3 3 1.00
4 Charmander https://pokemondb.net/pokedex/charmander Fire NA Lizard Pokémon 0.6 m (2′00″) 8.5 kg (18.7 lbs) 39 52 43 60 50 65 309 NA 1 1 3 0.33
5 Charmeleon https://pokemondb.net/pokedex/charmeleon Fire NA Flame Pokémon 1.1 m (3′07″) 19.0 kg (41.9 lbs) 58 64 58 80 65 80 405 NA 1 2 3 0.67

Clean Data

Clean up the Height and Weight fields so that each keeps only its metric value (meters and kilograms) as a number.

# Reduce the Height and Weight text columns to their leading number.
#
# Args:
#   data_tbl: Table with text columns Height and Weight.
#
# Returns:
#   The same table with Height and Weight replaced by numeric columns; text
#   containing no number becomes NA.
clean_pokemon_data <- function(data_tbl) {

  # Pull out the first number in each string and parse it
  first_number <- function(text) {
    as.numeric(str_extract(text, "\\d+\\.*\\d*"))
  }

  # The metric value comes first in both fields (meters, kilograms)
  data_tbl$Height <- first_number(data_tbl$Height)
  data_tbl$Weight <- first_number(data_tbl$Weight)

  data_tbl

}

# Replace the scraped table with its cleaned version
pokemon_tbl <- clean_pokemon_data(pokemon_tbl)
Number Name URLs Type 1 Type 2 Species Height Weight HP Attack Defense Sp. Atk Sp. Def Speed Total Type 3 Has Evolution Evolution Place Maximum Evolution Count Evolution Index
1 Bulbasaur https://pokemondb.net/pokedex/bulbasaur Grass Poison Seed Pokémon 0.7 6.9 45 49 49 65 65 45 318 NA 1 1 3 0.33
2 Ivysaur https://pokemondb.net/pokedex/ivysaur Grass Poison Seed Pokémon 1.0 13.0 60 62 63 80 80 60 405 NA 1 2 3 0.67
3 Venusaur https://pokemondb.net/pokedex/venusaur Grass Poison Seed Pokémon 2.0 100.0 80 82 83 100 100 80 525 NA 1 3 3 1.00
4 Charmander https://pokemondb.net/pokedex/charmander Fire NA Lizard Pokémon 0.6 8.5 39 52 43 60 50 65 309 NA 1 1 3 0.33
5 Charmeleon https://pokemondb.net/pokedex/charmeleon Fire NA Flame Pokémon 1.1 19.0 58 64 58 80 65 80 405 NA 1 2 3 0.67

Write Pokedex Data to Output File

Finally, write this data out to a CSV.

# Write data to CSV
# qmethod = "double" writes an embedded double quote as "" (the CSV
# convention) instead of write.table's default backslash escape
write.table(pokemon_tbl, "~/Projects/pokedex/data/pokedex.csv",
            sep = ",", row.names = FALSE, qmethod = "double")