Skip to content

Commit

Permalink
add collapse_vcdb function
Browse files Browse the repository at this point in the history
  • Loading branch information
onlyphantom committed Mar 11, 2019
1 parent f642d56 commit 777c247
Show file tree
Hide file tree
Showing 20 changed files with 437 additions and 14 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: verisr2
Type: Package
Title: Convenience functions for exploratory analysis on VERIS database
Version: 0.1.0
Version: 0.2.0
Authors@R: person("Samuel", "Chan", email = "[email protected]", role = c("aut", "cre"))
Description: Small helper functions for working with the typically-wide data frame
objects from the VERIS Community Database (VCDB).
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(collapse_vcdb)
export(enum2grid)
export(getenum_df)
export(getenum_stri)
Expand Down
167 changes: 167 additions & 0 deletions R/collapse.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#' Extract the name
#'
#' Extract the value (name) from the original representation of an
#' enumeration, e.g. \code{"Exfiltrate"} from
#' \code{"action.hacking.result.Exfiltrate"}.
#'
#' The result is always aligned with the input: elements in which no value
#' can be found (including \code{NA}) yield \code{NA} instead of being
#' dropped, so the output can safely be lined up with the rows it was
#' derived from.
#'
#' @param string Character or vector of characters representing one or more fields of interest (enumeration).
#' @return a character vector of the same length as \code{string}, with
#'   \code{NA} wherever no value could be extracted
#' @examples
#' extract_names("action.hacking.result.Exfiltrate")
extract_names <- function(string) {
  string <- as.character(string)
  # The pattern needs an upper-case start followed by at least two more
  # characters, so very short values (e.g. "No") do not match.
  # NOTE(review): `[A-z]` also spans the punctuation between "Z" and "a"
  # (such as "_"); kept unchanged to preserve existing matches.
  hits <- regexpr(pattern = "[[:upper:]]+[A-z ]+[[:alnum:]]", text = string)
  found <- !is.na(hits) & hits != -1L

  # regmatches() returns only the successful matches; scatter them back into
  # a full-length vector so non-matches stay in place as NA.
  values <- rep(NA_character_, length(string))
  values[found] <- regmatches(string, hits)
  values
}

#' Find the largest values
#'
#' For every row, locate the column holding the largest value (via
#' \code{max.col}) and return the enumeration value encoded in that column's
#' name. When several columns tie, the first one wins. Internal helper used by
#' \code{determine_primary} and \code{determine_primaryl}.
#'
#' @param data A \code{data frame} object, where all columns belong to the
#'   enumeration group being summarised (presumably numeric or logical, as
#'   required by \code{max.col}).
#' @return a character vector of the values that are largest in its enumeration group
#' @examples
#' find_largest(vcdb[1:10,57:105])
find_largest <- function(data) {
  # Position of the row-wise maximum; ties resolve to the left-most column.
  top_col <- max.col(data, ties.method = "first")
  winning_fields <- colnames(data)[top_col]
  extract_names(winning_fields)
}

#' Determine the primary value from each enumeration group (numeric)
#'
#' For each row, find the value that represents the enumeration: the column
#' with the largest amount. Missing amounts are treated as zero, and rows
#' whose amounts sum to zero (including rows where every value is NA) are
#' reported as "Unknown".
#'
#' @inheritParams getenum_stri
#' @return a character vector of the values that represent that enumeration for each numeric observation
#' @examples
#' determine_primary(vcdb, "action.hacking.variety")
#' table(determine_primary(vcdb, c("action.hacking.result", "action.hacking.variety")))
determine_primary <- function(data, string) {
  cols <- getenum_stri(data, string)

  # drop = FALSE keeps a single-column group two-dimensional, which both
  # rowSums() and max.col() (inside find_largest) need.
  amounts <- as.matrix(data[, cols, drop = FALSE])

  # Treat missing amounts as zero so an all-NA row becomes "Unknown" as
  # documented, rather than propagating NA through rowSums()/max.col().
  amounts[is.na(amounts)] <- 0

  ifelse(rowSums(amounts) == 0, "Unknown", find_largest(amounts))
}

#' Determine the primary value from each enumeration group (logical)
#'
#' For each row, find the value that represents the enumeration. Missing
#' flags are treated as \code{FALSE}. Rows with no flag set (including rows
#' where every value is NA) are reported as "Unknown"; rows with more than
#' one flag set are reported as "Multiple".
#'
#' @inheritParams getenum_stri
#' @return a character vector of the values that represent that enumeration for each logical observation
#' @examples
#' determine_primaryl(vcdb, "action.error.variety")
determine_primaryl <- function(data, string) {
  cols <- getenum_stri(data, string)

  # drop = FALSE keeps a single-column group two-dimensional, which both
  # rowSums() and max.col() (inside find_largest) need.
  flags <- as.matrix(data[, cols, drop = FALSE])

  # Missing flags count as "not set" so they neither poison the row sums nor
  # make max.col() return NA.
  flags[is.na(flags)] <- FALSE

  n_set <- rowSums(flags)
  result <- find_largest(flags)
  result[n_set == 0] <- "Unknown"
  result[n_set > 1] <- "Multiple"
  result
}

#' Determine the value for impact.overall_amount
#'
#' For each row where \code{impact.overall_amount} is 0 but the minimum and
#' maximum estimates sum to more than 0, replace the amount with the rounded
#' mean of \code{impact.overall_min_amount} and
#' \code{impact.overall_max_amount}. All other rows are left untouched.
#'
#' @param data A \code{data frame} object, typically converted from the VCDB JSON format.
#'   It must contain the columns \code{impact.overall_amount},
#'   \code{impact.overall_min_amount} and \code{impact.overall_max_amount}.
#' @return a data frame
#' @examples
#' determine_impact(vcdb)
determine_impact <- function(data) {
  # An estimate is only possible when at least one of the two bounds carries
  # information, i.e. min + max is positive.
  needs_estimate <- data$impact.overall_amount == 0 &
    (data$impact.overall_min_amount + data$impact.overall_max_amount > 0)

  # which() discards NA conditions, which are not allowed in data frame
  # sub-assignment.
  rows <- which(needs_estimate)

  if (length(rows) > 0) {
    bounds <- data[
      rows,
      c("impact.overall_max_amount", "impact.overall_min_amount"),
      drop = FALSE
    ]
    data[rows, "impact.overall_amount"] <- round(rowMeans(bounds), 0)
  }

  data
}

#' Extract all enumerations from a data frame
#'
#' Scans the column names and returns, once each, the lower-case prefix that
#' precedes a capitalised value (the part before \code{".Value"}). Column
#' names without such a suffix contribute nothing.
#'
#' @param vcdb A \code{data frame} object, typically converted from the VCDB JSON format.
#' @return a character vector containing the names of all enumerations
#' @examples
#' extract_enums(vcdb)
extract_enums <- function(vcdb) {
  field_names <- colnames(vcdb)
  # Look-ahead keeps the ".Value" suffix out of the match itself.
  prefix_hits <- regexpr(
    pattern = "[a-z._0-9]*(?=\\.[A-Z]+)",
    text = field_names,
    perl = TRUE
  )
  unique(regmatches(field_names, prefix_hits))
}

#' Process all logical enumerations in a data frame
#'
#' Returns a collapsed data frame by selecting a primary value that determine each enumeration
#' across all logical variables. One column is produced per enumeration found
#' by \code{extract_enums}, named after that enumeration.
#'
#' @param vcdb A \code{data frame} object, typically converted from the VCDB JSON format.
#' @return a collapsed data frame (always a data frame, even when only one
#'   or no enumeration is present)
#' @examples
#' process_log(vcdb)
process_log <- function(vcdb) {
  enum_list <- extract_enums(vcdb)

  # Iterating over the values (rather than 1:length()) is a no-op when no
  # enumeration was found.
  for (enum in enum_list) {
    vcdb[, enum] <- determine_primaryl(vcdb, enum)
  }

  # drop = FALSE keeps the result a data frame when there is a single
  # enumeration, so downstream verbs such as rename() keep working.
  vcdb[, enum_list, drop = FALSE]
}


#' Collapse the VCDB data frame into a more conventional "tidy" data frame
#'
#' Shrink the dimension of a VCDB data frame by using a representative value for each enumeration.
#' This function results in some loss of fidelity, a reasonable trade-off for the convenience we get
#' from a "tidy" data frame.
#'
#' The function handles logical enumerations (TRUE/FALSE) differently from factor enumerations as
#' well as numeric enumerations. The resulting data frame (output) contains new variables not in
#' the original VCDB that stores the "representative" value for each incident across each
#' enumeration group. Columns of the result are ordered alphabetically by name.
#'
#' @inheritParams process_log
#' @return a collapsed data frame more suited for tidyverse-esque EDA tasks
#' @examples
#' collapse_vcdb(vcdb)
#' @export
collapse_vcdb <- function(vcdb) {
  # Logical enumerations: one factor column per enumeration group.
  facts <- vcdb %>%
    select_if(is.logical) %>%
    select_if(~!all(is.na(.))) %>%
    process_log() %>%
    rename("pattern_collapsed" = pattern) %>%
    mutate_if(is.character, as.factor)

  # Numeric variables: fill NA with 0, estimate the overall impact, derive the
  # primary asset / attribute, then keep a fixed set of columns.
  nums <- vcdb %>%
    select_if(is.numeric) %>%
    select_if(~!all(is.na(.))) %>%
    replace(is.na(.), 0) %>%
    determine_impact() %>%
    mutate(
      asset.primary_asset = as.factor(determine_primary(., "asset.assets.amount")),
      attribute.confidentiality.primary_attribute = as.factor(determine_primary(., "attribute.confidentiality.data.amount"))
    ) %>%
    select(
      "asset.primary_asset", "asset.total_amount",
      "attribute.availability.duration.value",
      "attribute.confidentiality.data_total", "impact.overall_amount",
      "timeline.compromise.value", "timeline.discovery.value",
      "timeline.exfiltration.value", "victim.locations_affected",
      "victim.revenue.amount", "victim.secondary.amount",
      "attribute.confidentiality.primary_attribute"
    )

  # Character / factor variables; date parts are converted to factors first so
  # that they are retained by the final select_if().
  x <- vcdb %>%
    mutate(timeline.incident.year = as.factor(timeline.incident.year),
           timeline.incident.month = as.factor(timeline.incident.month),
           timeline.incident.day = as.factor(timeline.incident.day),
           plus.dbir_year = as.factor(plus.dbir_year),
           plus.timeline.notification.day = as.factor(plus.timeline.notification.day),
           plus.timeline.notification.month = as.factor(plus.timeline.notification.month),
           plus.timeline.notification.year = as.factor(plus.timeline.notification.year)) %>%
    select_if(function(col) is.character(col) || is.factor(col))

  vcdb_collapsed <- cbind(facts, nums, x) %>%
    select_if(~!all(is.na(.)))

  # Order columns alphabetically with base R instead of dplyr's deprecated
  # current_vars() helper, so this step does not depend on the dplyr version.
  vcdb_collapsed[, sort(colnames(vcdb_collapsed)), drop = FALSE]
}


10 changes: 5 additions & 5 deletions R/verisr2.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ NULL
packageStartupMessage("Welcome to verisr2. This package is written to add or replace functionalities broken in the old veris package by Jay Jacobs which included many legacy code that has been deprecated. Please file issues on GitHub.")
}

#' Find all variables in a VCDB data frame from a specified prefix
#' Find all variables in a VCDB data frame from a specified enumeration
#'
#' Find all fields from the data frame where its field name
#' is immediately preceded by the specified string.
#'
#' @param data A \code{data frame} object, typically converted from the VCDB JSON format.
#' @param string Character or vector of characters representing one or more fields of interest (prefix).
#' @return a character vector of all variables in the VCDB dataframe that has the speficied prefix
#' @param string Character or vector of characters representing one or more fields of interest (enumeration).
#' @return a character vector of all variables in the VCDB dataframe that have the specified enumeration
#' @examples
#' getenum_stri(vcdb, "action.error.vector")
#' getenum_stri (vcdb, "actor")
Expand All @@ -47,7 +47,7 @@ getenum_stri <- function(data, string){
#' is immediately preceded by the specified string.
#'
#' @param data A \code{data frame} object, typically converted from the VCDB JSON format
#' @param params Character or vector of characters representing one or more fields of interest (prefix)
#' @param params Character or vector of characters representing one or more fields of interest (enumeration)
#' @return a frequency table from enumerating on the specified fields of interests
#' @examples
#' getenum_tbl(vcdb, c("actor"))
Expand All @@ -71,7 +71,7 @@ getenum_tbl <- function(data, params){
#' @import dplyr
#' @import tidyr
#' @param data A \code{data frame} object, typically converted from the VCDB JSON format.
#' @param params Character representing the fieldsof interest (prefix).
#' @param params Character representing the fields of interest (enumeration).
#' @inheritParams getenum_tbl
#' @return a data frame with count and frequency from enumerating on one field.
#' @examples
Expand Down
85 changes: 84 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,93 @@ vcdb_small <- importveris("~/Datasets/vcdb_small/")
```

## [1] "veris dimensions"
## [1] 3 2437
## [1] 0 2437
## named integer(0)
## named integer(0)

Transform VCDB to a tidyverse-esque data frame
----------------------------------------------

`collapse_vcdb()` takes a `vcdb` data frame and turns it into a more
compact data frame that conforms to the “tidyverse” specifications. New
features are created from the original data, using values that best
represent each enumeration. An oversimplified diagram explaining this
process is as follows: ![](README_files/collapse.png)

``` r
tidy_vcdb <- collapse_vcdb(vcdb)
str(tidy_vcdb)
```

## Loading verisr2

## Welcome to verisr2. This package is written to add or replace functionalities broken in the old veris package by Jay Jacobs which included many legacy code that has been deprecated. Please file issues on GitHub.

## 'data.frame': 8198 obs. of 15 variables:
## $ action : Factor w/ 9 levels "Environmental",..: 5 7 2 3 2 2 3 3 7 6 ...
## $ action.environmental.notes : chr NA NA NA NA ...
## $ action.environmental.variety: Factor w/ 4 levels "Fire","Humidity",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ action.error.notes : chr NA NA NA NA ...
## $ action.error.variety : Factor w/ 18 levels "Capacity shortage",..: 18 18 6 18 10 10 18 18 18 18 ...
## $ action.error.vector : Factor w/ 8 levels "Carelessness",..: 8 8 8 8 1 1 8 8 8 8 ...
## $ action.hacking.cve : chr NA NA NA NA ...
## $ action.hacking.notes : chr NA NA NA NA ...
## $ action.hacking.result : Factor w/ 5 levels "Elevate","Exfiltrate",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ action.hacking.variety : Factor w/ 8 levels "Brute force",..: 6 6 6 5 6 6 6 6 6 6 ...
## $ action.hacking.vector : Factor w/ 11 levels "Backdoor or C2",..: 9 9 9 11 9 9 11 11 9 9 ...
## $ action.malware.cve : chr NA NA NA NA ...
## $ action.malware.name : chr NA NA NA NA ...
## $ action.malware.notes : chr NA NA NA NA ...
## $ action.malware.result : Factor w/ 5 levels "Elevate","Exfiltrate",..: 5 5 5 5 5 5 5 5 5 5 ...

Note that the new data frame is a lot more compact, with 175 instead of
the original 2,430+ variables:

``` r
dim(tidy_vcdb)
```

## [1] 8198 175

Where the original VCDB has a shape that resembles a “sparse matrix”,
this new “tidy” data frame now has most variables as factor and numeric
values. Obviously some loss of fidelity happens (a 2500-column data
matrix where most values are 0 is reduced to a 175-column data frame where
only the representative value is stored in each dimension / enumeration):

##
## c("ordered", "factor") character factor
## 1 59 105
## numeric
## 10

Combining with `ggplot2`
------------------------

The data (both the original `vcdb` and its tidy variant) also work
well with the rest of `tidyverse`. An example is to use the data in
conjunction with `dplyr` and `ggplot2`:

``` r
vcdb %>%
group_by(attribute.confidentiality.data_disclosure.Yes) %>%
dplyr::count(timeline.incident.year) %>%
ungroup() %>%
mutate(
breach = ifelse(attribute.confidentiality.data_disclosure.Yes,
"Breach", "Incident")
) %>% filter(
timeline.incident.year > 2000
) %>% ggplot(aes(x=timeline.incident.year, y=n, group=breach)) +
geom_col(aes(fill=breach), position = "dodge") +
scale_x_continuous(expand=c(0,0), breaks=seq(2000, 2018, 3)) +
scale_y_continuous(expand=c(0,0)) +
scale_fill_brewer(palette = 11) +
labs(title="VCDB Confidentiality Breaches", caption="Confidentiality breaches where data disclosure occured"
```

![](README_files/figure-markdown_github/unnamed-chunk-18-1.png)

Credits
-------

Expand Down
Binary file added README_files/collapse.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 25 additions & 0 deletions man/collapse_vcdb.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions man/determine_impact.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/determine_primary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 777c247

Please sign in to comment.