Skip to content

Commit

Permalink
#2288 Use hash for big data
Browse files Browse the repository at this point in the history
  • Loading branch information
To-om committed Dec 14, 2021
1 parent 4f28902 commit 8a4ac2b
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -630,9 +630,11 @@ class Output @Inject() (
} yield IdMapping(inputLog.metaData.id, log._id)
}

private def getData(value: String)(implicit graph: Graph, authContext: AuthContext): Try[Data with Entity] =
if (observableDataIsIndexed) dataSrv.create(Data(value))
else dataSrv.createEntity(Data(value))
private def getData(value: String)(implicit graph: Graph, authContext: AuthContext): Try[Data with Entity] = {
val (dataOrHash, fullData) = UseHashToIndex.hashToIndex(value).fold[(String, Option[String])](value -> None)(_ -> Some(value))
if (observableDataIsIndexed) dataSrv.create(Data(dataOrHash, fullData))
else dataSrv.createEntity(Data(dataOrHash, fullData))
}

private def createSimpleObservable(observable: Observable, observableType: ObservableType with Entity, dataValue: String)(implicit
graph: Graph,
Expand Down Expand Up @@ -700,7 +702,8 @@ class Output @Inject() (
richObservable <- createObservable(caseId, inputObservable, organisations.map(_._id).toSet)
_ <- reportTagSrv.updateTags(richObservable, inputObservable.reportTags)
case0 <- getCase(caseId)
_ <- organisations.toTry(o => shareSrv.shareObservable(RichObservable(richObservable, None, None, Nil), case0, o._id))
// the data in richObservable is not set because it is not used in shareSrv
_ <- organisations.toTry(o => shareSrv.shareObservable(RichObservable(richObservable, None, None, None, Nil), case0, o._id))
} yield IdMapping(inputObservable.metaData.id, richObservable._id)
}

Expand Down
6 changes: 4 additions & 2 deletions thehive/app/org/thp/thehive/TheHiveModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ import com.google.inject.AbstractModule
import net.codingwell.scalaguice.{ScalaModule, ScalaMultibinder}
import org.thp.scalligraph.SingleInstance
import org.thp.scalligraph.auth._
import org.thp.scalligraph.janus.JanusDatabaseProvider
import org.thp.scalligraph.janus.{ImmenseTermProcessor, JanusDatabaseProvider}
import org.thp.scalligraph.models.{Database, UpdatableSchema}
import org.thp.scalligraph.services.{GenIntegrityCheckOps, HadoopStorageSrv, S3StorageSrv}
import org.thp.thehive.controllers.v0.QueryExecutorVersion0Provider
import org.thp.thehive.models.TheHiveSchemaDefinition
import org.thp.thehive.models.{TheHiveSchemaDefinition, UseHashToIndex}
import org.thp.thehive.services.notification.notifiers._
import org.thp.thehive.services.notification.triggers._
import org.thp.thehive.services.{UserSrv => _, _}
Expand Down Expand Up @@ -112,6 +112,8 @@ class TheHiveModule(environment: Environment, configuration: Configuration) exte
bind[ActorRef].annotatedWithName("flow-actor").toProvider[FlowActorProvider]

bind[SingleInstance].to[ClusterSetup].asEagerSingleton()

ImmenseTermProcessor.registerStrategy("observableHashToIndex", _ => UseHashToIndex)
()
}
}
14 changes: 13 additions & 1 deletion thehive/app/org/thp/thehive/controllers/v0/ObservableCtrl.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@ package org.thp.thehive.controllers.v0

import net.lingala.zip4j.ZipFile
import net.lingala.zip4j.model.FileHeader
import org.apache.tinkerpop.gremlin.process.traversal.Compare
import org.thp.scalligraph._
import org.thp.scalligraph.auth.AuthContext
import org.thp.scalligraph.controllers._
import org.thp.scalligraph.models.{Database, Entity, UMapping}
import org.thp.scalligraph.query.PredicateOps.PredicateOpsDefs
import org.thp.scalligraph.query._
import org.thp.scalligraph.traversal.TraversalOps._
import org.thp.scalligraph.traversal.{IteratorOutput, Traversal}
import org.thp.scalligraph.utils.Hasher
import org.thp.thehive.controllers.v0.Conversion._
import org.thp.thehive.dto.v0.{InputAttachment, InputObservable}
import org.thp.thehive.models._
Expand Down Expand Up @@ -417,6 +420,7 @@ class PublicObservable @Inject() (
Query[Traversal.V[Observable], Traversal.V[Case]]("case", (observableSteps, _) => observableSteps.`case`),
Query[Traversal.V[Observable], Traversal.V[Alert]]("alert", (observableSteps, _) => observableSteps.alert)
)
lazy val hasher: Hasher = Hasher("SHA-256")
override val publicProperties: PublicProperties = PublicPropertyListBuilder[Observable]
.property("status", UMapping.string)(_.select(_.constant("Ok")).readonly)
.property("startDate", UMapping.date)(_.select(_._createdAt).readonly)
Expand Down Expand Up @@ -445,7 +449,15 @@ class PublicObservable @Inject() (
_ <- observableSrv.updateType(observable, newDataType)(graph, authContext)
} yield Json.obj("dataType" -> value)
})
.property("data", UMapping.string.optional)(_.field.readonly)
.property("data", UMapping.string.optional)(
_.select(_.value(_.data))
.filter[String] {
case (_, observables, _, Right(predicate)) => observables.has(_.data, predicate.mapValue(v => UseHashToIndex.hashToIndex(v).getOrElse(v)))
case (_, observables, _, Left(true)) => observables.has(_.data)
case (_, observables, _, Left(false)) => observables.hasNot(_.data)
}
.readonly
)
.property("attachment.name", UMapping.string.optional)(_.select(_.attachments.value(_.name)).readonly)
.property("attachment.hashes", UMapping.hash.sequence)(_.select(_.attachments.value(_.hashes)).readonly)
.property("attachment.size", UMapping.long.optional)(_.select(_.attachments.value(_.size)).readonly)
Expand Down
14 changes: 12 additions & 2 deletions thehive/app/org/thp/thehive/controllers/v1/Properties.scala
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package org.thp.thehive.controllers.v1

import org.apache.tinkerpop.gremlin.process.traversal.Compare
import org.apache.tinkerpop.gremlin.structure.T
import org.thp.scalligraph.controllers.{FPathElem, FPathEmpty, FString}
import org.thp.scalligraph.models.{Database, UMapping}
import org.thp.scalligraph.query.PredicateOps._
import org.thp.scalligraph.query.{PublicProperties, PublicPropertyListBuilder}
import org.thp.scalligraph.traversal.TraversalOps._
import org.thp.scalligraph.utils.Hasher
import org.thp.scalligraph.{BadRequestError, EntityId, EntityIdOrName, InvalidFormatAttributeError, RichSeq}
import org.thp.thehive.dto.v1.InputCustomFieldValue
import org.thp.thehive.models._
Expand All @@ -28,7 +30,7 @@ import org.thp.thehive.services._
import play.api.libs.json.{JsObject, JsValue, Json}

import javax.inject.{Inject, Singleton}
import scala.util.{Failure, Success, Try}
import scala.util.{Failure, Success}

@Singleton
class Properties @Inject() (
Expand Down Expand Up @@ -452,7 +454,15 @@ class Properties @Inject() (
_ <- observableSrv.updateType(observable, newDataType)(graph, authContext)
} yield Json.obj("dataType" -> value)
})
.property("data", UMapping.string.optional)(_.field.readonly)
.property("data", UMapping.string.optional)(
_.select(_.value(_.data))
.filter[String] {
case (_, observables, _, Right(predicate)) => observables.has(_.data, predicate.mapValue(v => UseHashToIndex.hashToIndex(v).getOrElse(v)))
case (_, observables, _, Left(true)) => observables.has(_.data)
case (_, observables, _, Left(false)) => observables.hasNot(_.data)
}
.readonly
)
.property("attachment.name", UMapping.string.optional)(_.select(_.attachments.value(_.name)).readonly)
.property("attachment.hashes", UMapping.hash.sequence)(_.select(_.attachments.value(_.hashes)).readonly)
.property("attachment.size", UMapping.long.optional)(_.select(_.attachments.value(_.size)).readonly)
Expand Down
51 changes: 47 additions & 4 deletions thehive/app/org/thp/thehive/models/Observable.scala
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
package org.thp.thehive.models

import org.thp.scalligraph.models.{DefineIndex, Entity, IndexType}
import org.apache.tinkerpop.gremlin.structure.{Vertex, VertexProperty}
import org.thp.scalligraph.janus.{ImmenseStringTermFilter, ImmenseTermProcessor}
import org.thp.scalligraph.models.{DefineIndex, Entity, IndexType, UMapping}
import org.thp.scalligraph.utils.Hasher
import org.thp.scalligraph.{BuildEdgeEntity, BuildVertexEntity, EntityId}

import java.util.Date
import scala.util.Try

@BuildEdgeEntity[Observable, KeyValue]
case class ObservableKeyValue()
Expand Down Expand Up @@ -46,6 +50,7 @@ case class Observable(

case class RichObservable(
observable: Observable with Entity,
fullData: Option[Data with Entity],
attachment: Option[Attachment with Entity],
seen: Option[Boolean],
reportTags: Seq[ReportTag with Entity]
Expand All @@ -60,12 +65,50 @@ case class RichObservable(
def ioc: Boolean = observable.ioc
def sighted: Boolean = observable.sighted
def ignoreSimilarity: Option[Boolean] = observable.ignoreSimilarity
def dataOrAttachment: Either[String, Attachment with Entity] = observable.data.toLeft(attachment.get)
def dataOrAttachment: Either[String, Attachment with Entity] = data.toLeft(attachment.get)
def dataType: String = observable.dataType
def data: Option[String] = observable.data
def data: Option[String] = fullData.map(d => d.fullData.getOrElse(d.data))
def tags: Seq[String] = observable.tags
}

@DefineIndex(IndexType.standard, "data")
@BuildVertexEntity
case class Data(data: String)
case class Data(data: String, fullData: Option[String])

object UseHashToIndex extends ImmenseTermProcessor with ImmenseStringTermFilter {
override val termSizeLimit: Int = 8191
private val hasher: Hasher = Hasher("SHA-256")

def hashToIndex(value: String): Option[String] =
if (value.length > termSizeLimit) Some(hasher.fromString(value).head.toString)
else None

override def apply[V](vertex: Vertex, property: VertexProperty[V]): Boolean = {
if (property.key() == "data")
vertex.label() match {
case "Observable" =>
collect(vertex, property).foreach { strProp =>
val currentValue = strProp.value()
logger.info(s"""Use hash for observable ~${vertex.id()}:
| dataType=${UMapping.string.getProperty(vertex, "dataType")}
| data=$currentValue
| message=${UMapping.string.optional.getProperty(vertex, "message").getOrElse("<not set>")}
| tags=${UMapping.string.sequence.getProperty(vertex, "message").mkString(", ")}""".stripMargin)
strProp.remove()
vertex.property(strProp.key(), hasher.fromString(currentValue).head.toString)
}

case "Data" =>
collect(vertex, property).foreach { strProp =>
val currentValue = strProp.value()
logger.info(s"Use hash and move data for $vertex/${strProp.key()}: $currentValue")
strProp.remove()
vertex.property(strProp.key(), hasher.fromString(currentValue).head.toString)
vertex.property("fullData", currentValue)
}

case _ =>
}
false
}
}
37 changes: 22 additions & 15 deletions thehive/app/org/thp/thehive/services/ObservableSrv.scala
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import org.thp.scalligraph.services._
import org.thp.scalligraph.traversal.Converter.Identity
import org.thp.scalligraph.traversal.TraversalOps._
import org.thp.scalligraph.traversal.{Converter, Graph, StepLabel, Traversal}
import org.thp.scalligraph.utils.Hash
import org.thp.scalligraph.utils.{Hash, Hasher}
import org.thp.scalligraph.{BadRequestError, CreateError, EntityId, EntityIdOrName, EntityName, RichSeq}
import org.thp.thehive.models._
import org.thp.thehive.services.AlertOps._
Expand Down Expand Up @@ -76,7 +76,7 @@ class ObservableSrv @Inject() (
_ <- observableObservableTypeSrv.create(ObservableObservableType(), createdObservable, observableType)
_ <- observableAttachmentSrv.create(ObservableAttachment(), createdObservable, attachment)
_ <- tags.toTry(observableTagSrv.create(ObservableTag(), createdObservable, _))
} yield RichObservable(createdObservable, Some(attachment), None, Nil)
} yield RichObservable(createdObservable, None, Some(attachment), None, Nil)
}

def create(
Expand All @@ -86,10 +86,11 @@ class ObservableSrv @Inject() (
graph: Graph,
authContext: AuthContext
): Try[RichObservable] = {
val (dataOrHash, fullData) = UseHashToIndex.hashToIndex(dataValue).fold[(String, Option[String])](dataValue -> None)(_ -> Some(dataValue))
val alreadyExists = startTraversal
.has(_.organisationIds, organisationSrv.currentId)
.has(_.relatedId, observable.relatedId)
.has(_.data, dataValue)
.has(_.data, dataOrHash)
.has(_.dataType, observable.dataType)
.exists
if (alreadyExists) Failure(CreateError("Observable already exists"))
Expand All @@ -100,12 +101,12 @@ class ObservableSrv @Inject() (
if (observableType.isAttachment) Failure(BadRequestError("A attachment observable doesn't accept string value"))
else Success(())
tags <- observable.tags.toTry(tagSrv.getOrCreate)
data <- dataSrv.create(Data(dataValue))
createdObservable <- createEntity(observable.copy(data = Some(dataValue)))
data <- dataSrv.create(Data(dataOrHash, fullData))
createdObservable <- createEntity(observable.copy(data = Some(dataOrHash)))
_ <- observableObservableTypeSrv.create(ObservableObservableType(), createdObservable, observableType)
_ <- observableDataSrv.create(ObservableData(), createdObservable, data)
_ <- tags.toTry(observableTagSrv.create(ObservableTag(), createdObservable, _))
} yield RichObservable(createdObservable, None, None, Nil)
} yield RichObservable(createdObservable, Some(data), None, None, Nil)
}

def addTags(observable: Observable with Entity, tags: Set[String])(implicit graph: Graph, authContext: AuthContext): Try[Seq[Tag with Entity]] = {
Expand Down Expand Up @@ -289,14 +290,16 @@ object ObservableOps {
traversal
.project(
_.by
.by(_.attachments.fold)
.by(_.data.option)
.by(_.attachments.option)
.by(_.reportTags.fold)
)
.domainMap {
case (observable, attachment, reportTags) =>
case (observable, data, attachment, reportTags) =>
RichObservable(
observable,
attachment.headOption,
data,
attachment,
None,
reportTags
)
Expand All @@ -308,15 +311,17 @@ object ObservableOps {
traversal
.project(
_.by
.by(_.attachments.fold)
.by(_.data.option)
.by(_.attachments.option)
.by(_.filteredSimilar.visible(organisationSrv).limit(1).count)
.by(_.reportTags.fold)
)
.domainMap {
case (observable, attachment, count, reportTags) =>
case (observable, data, attachment, count, reportTags) =>
RichObservable(
observable,
attachment.headOption,
data,
attachment,
Some(count != 0),
reportTags
)
Expand All @@ -329,16 +334,18 @@ object ObservableOps {
traversal
.project(
_.by
.by(_.attachments.fold)
.by(_.data.option)
.by(_.attachments.option)
.by(_.filteredSimilar.visible(organisationSrv).limit(1).count)
.by(_.reportTags.fold)
.by(entityRenderer)
)
.domainMap {
case (observable, attachment, count, reportTags, renderedEntity) =>
case (observable, data, attachment, count, reportTags, renderedEntity) =>
RichObservable(
observable,
attachment.headOption,
data,
attachment,
Some(count != 0),
reportTags
) -> renderedEntity
Expand Down
5 changes: 2 additions & 3 deletions thehive/test/org/thp/thehive/DatabaseBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package org.thp.thehive
import org.scalactic.Or
import org.thp.scalligraph.auth.{AuthContext, AuthContextImpl}
import org.thp.scalligraph.controllers._
import org.thp.scalligraph.models.{Database, Entity, Schema}
import org.thp.scalligraph.models.{Database, Entity}
import org.thp.scalligraph.services.{EdgeSrv, GenIntegrityCheckOps, VertexSrv}
import org.thp.scalligraph.traversal.Graph
import org.thp.scalligraph.traversal.TraversalOps._
Expand All @@ -29,7 +29,6 @@ import scala.util.{Failure, Success, Try}

@Singleton
class DatabaseBuilder @Inject() (
schema: Schema,
alertSrv: AlertSrv,
attachmentSrv: AttachmentSrv,
caseSrv: CaseSrv,
Expand Down Expand Up @@ -233,7 +232,7 @@ class DatabaseBuilder @Inject() (
dataSrv
.getByName(data)
.getOrFail("data")
.orElse(dataSrv.create(Data(data)))
.orElse(dataSrv.create(Data(data, None)))
.flatMap(observableSrv.observableDataSrv.create(ObservableData(), observable, _))
.get
)
Expand Down

0 comments on commit 8a4ac2b

Please sign in to comment.