add collapse_vcdb function

onlyphantom · Mar 11, 2019 · 777c247 · 777c247
1 parent f642d56
commit 777c247
Show file tree

Hide file tree

Showing 20 changed files with 437 additions and 14 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: verisr2
 Type: Package
 Title: Convenience functions for exploratory analysis on VERIS database
-Version: 0.1.0
+Version: 0.2.0
 Authors@R: person("Samuel", "Chan", email = "[email protected]", role = c("aut", "cre"))
 Description: Small helper functions for working with the typically-wide data frame
     objects from the VERIS Community Database (VCDB).

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(collapse_vcdb)
 export(enum2grid)
 export(getenum_df)
 export(getenum_stri)

diff --git a/R/collapse.R b/R/collapse.R
@@ -0,0 +1,167 @@
+#' Extract the name
+#'
+#' Pull the enumeration value (name) out of a fully qualified VCDB field name.
+#' @param string Character or vector of characters representing one or more fields of interest (enumeration).
+#' @return a character vector holding the matched names; elements without a match are omitted
+#' @examples
+#' extract_names("action.hacking.result.Exfiltrate")
+extract_names <- function(string) {
+  regmatches(x = string, m = regexpr("[[:upper:]]+[A-z ]+[[:alnum:]]", string))
+}
+
+#' Find the largest values
+#'
+#' Per row, name the column that \code{max.col} picks (first column wins ties). Internal helper for \code{determine_primary} and \code{determine_primaryl}.
+#'
+#' @param data A \code{data frame} object; it is handed to \code{max.col} unchanged
+#' @return a character vector of the values that are largest in its enumeration group
+#' @examples
+#' find_largest(vcdb[1:10,57:105])
+find_largest <- function(data) {
+  extract_names(colnames(data)[max.col(data, ties.method = "first")])
+}
+
+#' Determine the primary value from each enumeration group (numeric)
+#'
+#' For each row, find a value that represents the enumeration. If the values in the group sum
+#' to zero for a row, that row is labelled "Unknown" instead.
+#'
+#' @inheritParams getenum_stri
+#' @return a character vector of the values that represent that enumeration for each numeric observation
+#' @examples
+#' determine_primary(vcdb, "action.hacking.variety")
+#' table(determine_primary(vcdb, c("action.hacking.result", "action.hacking.variety")))
+determine_primary <- function(data, string){
+  # drop = FALSE keeps a data frame even when the enumeration has a single column
+  group <- data[, getenum_stri(data, string), drop = FALSE]
+  return(ifelse(rowSums(group) == 0, "Unknown", find_largest(group)))
+}
+
+#' Determine the primary value from each enumeration group (logical)
+#'
+#' For each row, find a value that represents the enumeration. Rows whose group sums to zero
+#' (no TRUE column) become "Unknown"; rows with more than one TRUE column become "Multiple".
+#'
+#' @inheritParams getenum_stri
+#' @return a character vector of the values that represent that enumeration for each logical observation
+#' @examples
+#' determine_primaryl(vcdb, "action.error.variety")
+
+determine_primaryl <- function(data, string){
+  # drop = FALSE keeps a data frame even when the enumeration has a single column
+  group <- data[, getenum_stri(data, string), drop = FALSE]
+  # per-row count of TRUE flags within the group
+  hits <- rowSums(group)
+  result <- find_largest(group)
+  result[hits == 0] <- "Unknown"
+  result[hits > 1] <- "Multiple"
+  return(result)
+}
+
+#' Determine the value for impact.overall_amount
+#'
+#' Where impact.overall_amount is 0 but min + max is positive, replace it with the rounded mean of impact.overall_min_amount and impact.overall_max_amount.
+#'
+#' @param data A \code{data frame} object, typically converted from the VCDB JSON format.
+#' @return a data frame
+#' @examples
+#' determine_impact(vcdb)
+
+determine_impact <- function(data){
+  # rows with no overall amount recorded but a positive min + max estimate
+  cond1 <- data$impact.overall_amount == 0 &
+    (data$impact.overall_min_amount +
+       data$impact.overall_max_amount > 0 )
+  data[cond1, "impact.overall_amount"] <- round(rowMeans(data[cond1, c("impact.overall_max_amount", "impact.overall_min_amount")]),0)
+  return(data)
+}
+
+#' Extract all enumerations from a data frame
+#'
+#' Collects the distinct enumeration prefixes found in the column names.
+#'
+#' @param vcdb A \code{data frame} object, typically converted from the VCDB JSON format.
+#' @return a character vector containing the names of all enumerations
+#' @examples
+#' extract_enums(vcdb)
+extract_enums <- function(vcdb) {
+  fields <- colnames(vcdb)
+  unique(regmatches(fields, regexpr(pattern="[a-z._0-9]*(?=\\.[A-Z]+)", text=fields, perl = TRUE)))
+}
+
+#' Process all logical enumerations in a data frame
+#'
+#' Returns a collapsed data frame by selecting a primary value that determines each enumeration
+#' across all logical variables
+#'
+#' @param vcdb A \code{data frame} object, typically converted from the VCDB JSON format.
+#' @return a collapsed data frame with one column per enumeration
+#' @examples
+#' process_log(vcdb)
+
+process_log <- function(vcdb){
+  enum_list <- extract_enums(vcdb)
+  for(enum in enum_list){  # looping over the values does nothing when no enumeration is found
+    vcdb[,enum] <- determine_primaryl(vcdb, enum)
+  }
+  return(vcdb[, enum_list, drop = FALSE])  # drop = FALSE: stay a data frame for a single enumeration
+}
+
+
+#' Collapse the VCDB data frame into a more conventional "tidy" data frame
+#'
+#' Shrink the dimension of a VCDB data frame by using a representative value for each enumeration.
+#' This function results in some loss of fidelity, a reasonable trade-off for the convenience we get
+#' from a "tidy" data frame.
+#'
+#' The function handles logical enumerations (TRUE/FALSE) differently from factor enumerations as
+#' well as numeric enumerations. The resulting data frame (output) contains new variables not in
+#' the original VCDB that store the "representative" value for each incident across each
+#' enumeration group
+#'
+#' @param vcdb A \code{data frame} object, typically converted from the VCDB JSON format.
+#' @return a collapsed data frame (all-NA columns dropped, columns sorted by name) more suited for tidyverse-esque EDA tasks
+#' @examples
+#' collapse_vcdb(vcdb)
+#' @export
+
+
+collapse_vcdb <- function(vcdb){
+  facts <- vcdb %>%
+    select_if(is.logical) %>%
+    select_if(~!all(is.na(.))) %>%
+    process_log() %>%
+    rename("pattern_collapsed"=pattern) %>%
+    mutate_if(is.character, as.factor)
+
+  nums <- vcdb %>%
+    select_if(is.numeric) %>%
+    select_if(~!all(is.na(.))) %>%
+    replace(is.na(.), 0) %>%
+    determine_impact() %>%
+    mutate(
+      asset.primary_asset = as.factor(determine_primary(., "asset.assets.amount")),
+      attribute.confidentiality.primary_attribute = as.factor(determine_primary(., "attribute.confidentiality.data.amount"))
+    ) %>%
+    select(
+      "asset.primary_asset", "asset.total_amount", "attribute.availability.duration.value", "attribute.confidentiality.data_total", "impact.overall_amount", "timeline.compromise.value", "timeline.discovery.value", "timeline.exfiltration.value", "victim.locations_affected", "victim.revenue.amount", "victim.secondary.amount", "asset.primary_asset", "attribute.confidentiality.primary_attribute"
+    )
+
+  x <- vcdb %>%
+    mutate(timeline.incident.year = as.factor(timeline.incident.year),
+           timeline.incident.month = as.factor(timeline.incident.month),
+           timeline.incident.day = as.factor(timeline.incident.day),
+           plus.dbir_year = as.factor(plus.dbir_year),
+           plus.timeline.notification.day = as.factor(plus.timeline.notification.day),
+           plus.timeline.notification.month = as.factor(plus.timeline.notification.month),
+           plus.timeline.notification.year = as.factor(plus.timeline.notification.year)) %>%
+    select_if(function(col) is.character(col) || is.factor(col))
+
+
+  vcdb_collapsed <- cbind(facts, nums, x) %>%
+    select_if(~!all(is.na(.))) %>%
+    select(sort(current_vars()))
+  return(vcdb_collapsed)
+}
+
+
diff --git a/R/verisr2.R b/R/verisr2.R
@@ -16,14 +16,14 @@ NULL
   packageStartupMessage("Welcome to verisr2. This package is written to add or replace functionalities broken in the old veris package by Jay Jacobs which included many legacy code that has been deprecated. Please file issues on GitHub.")
 }
 
-#' Find all variables in a VCDB data frame from a specified prefix
+#' Find all variables in a VCDB data frame from a specified enumeration
 #'
 #' Find all fields from the data frame where its field name
 #' is immediately preceded by the specified string.
 #'
 #' @param data A \code{data frame} object, typically converted from the VCDB JSON format.
-#' @param string Character or vector of characters representing one or more fields of interest (prefix).
-#' @return a character vector of all variables in the VCDB dataframe that has the speficied prefix
+#' @param string Character or vector of characters representing one or more fields of interest (enumeration).
+#' @return a character vector of all variables in the VCDB dataframe that have the specified enumeration
 #' @examples
 #' getenum_stri(vcdb, "action.error.vector")
 #' getenum_stri (vcdb, "actor")
@@ -47,7 +47,7 @@ getenum_stri <- function(data, string){
 #' is immediately preceded by the specified string.
 #'
 #' @param data A \code{data frame} object, typically converted from the VCDB JSON format
-#' @param params Character or vector of characters representing one or more fields of interest (prefix)
+#' @param params Character or vector of characters representing one or more fields of interest (enumeration)
 #' @return a frequency table from enumerating on the specified fields of interests
 #' @examples
 #' getenum_tbl(vcdb, c("actor"))
@@ -71,7 +71,7 @@ getenum_tbl <- function(data, params){
 #' @import dplyr
 #' @import tidyr
 #' @param data A \code{data frame} object, typically converted from the VCDB JSON format.
-#' @param params Character representing the fieldsof interest (prefix).
+#' @param params Character representing the fields of interest (enumeration).
 #' @inheritParams getenum_tbl
 #' @return a data frame with count and frequency from enumerating on one field.
 #' @examples

diff --git a/README.md b/README.md
@@ -168,10 +168,93 @@ vcdb_small <- importveris("~/Datasets/vcdb_small/")
 ```
 
     ## [1] "veris dimensions"
-    ## [1]    3 2437
+    ## [1]    0 2437
     ## named integer(0)
     ## named integer(0)
 
+Transform VCDB to a tidyverse-esque data frame
+----------------------------------------------
+
+`collapse_vcdb()` takes a `vcdb` data frame and turns it into a more
+compact data frame that conforms to the “tidyverse” specifications. New
+features are created from the original data, using values that best
+represent each enumeration. An oversimplified diagram explaining this
+process is as follows: ![](README_files/collapse.png)
+
+``` r
+tidy_vcdb <- collapse_vcdb(vcdb)
+str(tidy_vcdb)
+```
+
+    ## Loading verisr2
+
+    ## Welcome to verisr2. This package is written to add or replace functionalities broken in the old veris package by Jay Jacobs which included many legacy code that has been deprecated. Please file issues on GitHub.
+
+    ## 'data.frame':    8198 obs. of  15 variables:
+    ##  $ action                      : Factor w/ 9 levels "Environmental",..: 5 7 2 3 2 2 3 3 7 6 ...
+    ##  $ action.environmental.notes  : chr  NA NA NA NA ...
+    ##  $ action.environmental.variety: Factor w/ 4 levels "Fire","Humidity",..: 4 4 4 4 4 4 4 4 4 4 ...
+    ##  $ action.error.notes          : chr  NA NA NA NA ...
+    ##  $ action.error.variety        : Factor w/ 18 levels "Capacity shortage",..: 18 18 6 18 10 10 18 18 18 18 ...
+    ##  $ action.error.vector         : Factor w/ 8 levels "Carelessness",..: 8 8 8 8 1 1 8 8 8 8 ...
+    ##  $ action.hacking.cve          : chr  NA NA NA NA ...
+    ##  $ action.hacking.notes        : chr  NA NA NA NA ...
+    ##  $ action.hacking.result       : Factor w/ 5 levels "Elevate","Exfiltrate",..: 5 5 5 5 5 5 5 5 5 5 ...
+    ##  $ action.hacking.variety      : Factor w/ 8 levels "Brute force",..: 6 6 6 5 6 6 6 6 6 6 ...
+    ##  $ action.hacking.vector       : Factor w/ 11 levels "Backdoor or C2",..: 9 9 9 11 9 9 11 11 9 9 ...
+    ##  $ action.malware.cve          : chr  NA NA NA NA ...
+    ##  $ action.malware.name         : chr  NA NA NA NA ...
+    ##  $ action.malware.notes        : chr  NA NA NA NA ...
+    ##  $ action.malware.result       : Factor w/ 5 levels "Elevate","Exfiltrate",..: 5 5 5 5 5 5 5 5 5 5 ...
+
+Note that the new data frame is a lot more compact, with 175 instead of
+the original 2,430+ variables:
+
+``` r
+dim(tidy_vcdb)
+```
+
+    ## [1] 8198  175
+
+Where the original VCDB has a shape that resembles a “sparse matrix”,
+this new “tidy” data frame now has most variables as factor and numeric
+values. Obviously some loss of fidelity happens (a 2500-column data
+matrix where most values are 0 is reduced to 175 columns where only the
+representative value is stored in each dimension / enumeration):
+
+    ## 
+    ## c("ordered", "factor")              character                 factor 
+    ##                      1                     59                    105 
+    ##                numeric 
+    ##                     10
+
+Combining with `ggplot2`
+------------------------
+
+The data (both the original `vcdb` and its tidy variant) also works
+well with the rest of `tidyverse`. An example is to use the data in
+conjunction with `dplyr` and `ggplot2`:
+
+``` r
+vcdb %>%
+  group_by(attribute.confidentiality.data_disclosure.Yes) %>%
+  dplyr::count(timeline.incident.year) %>%
+  ungroup() %>% 
+  mutate(  # label each group from its data_disclosure.Yes flag
+    breach = ifelse(attribute.confidentiality.data_disclosure.Yes, 
+                    "Breach", "Incident")
+  ) %>% filter(
+    timeline.incident.year > 2000
+  ) %>% ggplot(aes(x=timeline.incident.year, y=n, group=breach)) +
+  geom_col(aes(fill=breach), position = "dodge") +
+  scale_x_continuous(expand=c(0,0), breaks=seq(2000, 2018, 3)) + 
+  scale_y_continuous(expand=c(0,0)) + 
+  scale_fill_brewer(palette = 11) + 
+  labs(title="VCDB Confidentiality Breaches", caption="Confidentiality breaches where data disclosure occured")
+```
+
+![](README_files/figure-markdown_github/unnamed-chunk-18-1.png)
+
 Credits
 -------
 

diff --git a/README_files/collapse.png b/README_files/collapse.png
diff --git a/README_files/figure-markdown_github/unnamed-chunk-18-1.png b/README_files/figure-markdown_github/unnamed-chunk-18-1.png
diff --git a/man/collapse_vcdb.Rd b/man/collapse_vcdb.Rd
diff --git a/man/determine_impact.Rd b/man/determine_impact.Rd
diff --git a/man/determine_primary.Rd b/man/determine_primary.Rd