From 8a4ac2bd3802eaff1b9b5c7c4f6540c340354742 Mon Sep 17 00:00:00 2001 From: To-om Date: Tue, 14 Dec 2021 16:46:59 +0100 Subject: [PATCH] #2288 Use hash for big data --- .../thp/thehive/migration/th4/Output.scala | 11 ++-- .../app/org/thp/thehive/TheHiveModule.scala | 6 ++- .../controllers/v0/ObservableCtrl.scala | 14 ++++- .../thehive/controllers/v1/Properties.scala | 14 ++++- .../org/thp/thehive/models/Observable.scala | 51 +++++++++++++++++-- .../thp/thehive/services/ObservableSrv.scala | 37 ++++++++------ .../org/thp/thehive/DatabaseBuilder.scala | 5 +- 7 files changed, 107 insertions(+), 31 deletions(-) diff --git a/migration/src/main/scala/org/thp/thehive/migration/th4/Output.scala b/migration/src/main/scala/org/thp/thehive/migration/th4/Output.scala index 5fdfdfd814..0eafba74e5 100644 --- a/migration/src/main/scala/org/thp/thehive/migration/th4/Output.scala +++ b/migration/src/main/scala/org/thp/thehive/migration/th4/Output.scala @@ -630,9 +630,11 @@ class Output @Inject() ( } yield IdMapping(inputLog.metaData.id, log._id) } - private def getData(value: String)(implicit graph: Graph, authContext: AuthContext): Try[Data with Entity] = - if (observableDataIsIndexed) dataSrv.create(Data(value)) - else dataSrv.createEntity(Data(value)) + private def getData(value: String)(implicit graph: Graph, authContext: AuthContext): Try[Data with Entity] = { + val (dataOrHash, fullData) = UseHashToIndex.hashToIndex(value).fold[(String, Option[String])](value -> None)(_ -> Some(value)) + if (observableDataIsIndexed) dataSrv.create(Data(dataOrHash, fullData)) + else dataSrv.createEntity(Data(dataOrHash, fullData)) + } private def createSimpleObservable(observable: Observable, observableType: ObservableType with Entity, dataValue: String)(implicit graph: Graph, @@ -700,7 +702,8 @@ class Output @Inject() ( richObservable <- createObservable(caseId, inputObservable, organisations.map(_._id).toSet) _ <- reportTagSrv.updateTags(richObservable, inputObservable.reportTags) case0 <- getCase(caseId) - _ <- organisations.toTry(o => shareSrv.shareObservable(RichObservable(richObservable, None, None, Nil), case0, o._id)) + // the data in richObservable is not set because it is not used in shareSrv + _ <- organisations.toTry(o => shareSrv.shareObservable(RichObservable(richObservable, None, None, None, Nil), case0, o._id)) } yield IdMapping(inputObservable.metaData.id, richObservable._id) } diff --git a/thehive/app/org/thp/thehive/TheHiveModule.scala b/thehive/app/org/thp/thehive/TheHiveModule.scala index 988e8101b2..e2ab245244 100644 --- a/thehive/app/org/thp/thehive/TheHiveModule.scala +++ b/thehive/app/org/thp/thehive/TheHiveModule.scala @@ -6,11 +6,11 @@ import com.google.inject.AbstractModule import net.codingwell.scalaguice.{ScalaModule, ScalaMultibinder} import org.thp.scalligraph.SingleInstance import org.thp.scalligraph.auth._ -import org.thp.scalligraph.janus.JanusDatabaseProvider +import org.thp.scalligraph.janus.{ImmenseTermProcessor, JanusDatabaseProvider} import org.thp.scalligraph.models.{Database, UpdatableSchema} import org.thp.scalligraph.services.{GenIntegrityCheckOps, HadoopStorageSrv, S3StorageSrv} import org.thp.thehive.controllers.v0.QueryExecutorVersion0Provider -import org.thp.thehive.models.TheHiveSchemaDefinition +import org.thp.thehive.models.{TheHiveSchemaDefinition, UseHashToIndex} import org.thp.thehive.services.notification.notifiers._ import org.thp.thehive.services.notification.triggers._ import org.thp.thehive.services.{UserSrv => _, _} @@ -112,6 +112,8 @@ class TheHiveModule(environment: Environment, configuration: Configuration) exte bind[ActorRef].annotatedWithName("flow-actor").toProvider[FlowActorProvider] bind[SingleInstance].to[ClusterSetup].asEagerSingleton() + + ImmenseTermProcessor.registerStrategy("observableHashToIndex", _ => UseHashToIndex) () } } diff --git a/thehive/app/org/thp/thehive/controllers/v0/ObservableCtrl.scala b/thehive/app/org/thp/thehive/controllers/v0/ObservableCtrl.scala index 0ef5aceb1d..8853f0c9cc 100644 --- a/thehive/app/org/thp/thehive/controllers/v0/ObservableCtrl.scala +++ b/thehive/app/org/thp/thehive/controllers/v0/ObservableCtrl.scala @@ -2,13 +2,16 @@ package org.thp.thehive.controllers.v0 import net.lingala.zip4j.ZipFile import net.lingala.zip4j.model.FileHeader +import org.apache.tinkerpop.gremlin.process.traversal.Compare import org.thp.scalligraph._ import org.thp.scalligraph.auth.AuthContext import org.thp.scalligraph.controllers._ import org.thp.scalligraph.models.{Database, Entity, UMapping} +import org.thp.scalligraph.query.PredicateOps.PredicateOpsDefs import org.thp.scalligraph.query._ import org.thp.scalligraph.traversal.TraversalOps._ import org.thp.scalligraph.traversal.{IteratorOutput, Traversal} +import org.thp.scalligraph.utils.Hasher import org.thp.thehive.controllers.v0.Conversion._ import org.thp.thehive.dto.v0.{InputAttachment, InputObservable} import org.thp.thehive.models._ @@ -417,6 +420,7 @@ class PublicObservable @Inject() ( Query[Traversal.V[Observable], Traversal.V[Case]]("case", (observableSteps, _) => observableSteps.`case`), Query[Traversal.V[Observable], Traversal.V[Alert]]("alert", (observableSteps, _) => observableSteps.alert) ) + lazy val hasher: Hasher = Hasher("SHA-256") override val publicProperties: PublicProperties = PublicPropertyListBuilder[Observable] .property("status", UMapping.string)(_.select(_.constant("Ok")).readonly) .property("startDate", UMapping.date)(_.select(_._createdAt).readonly) @@ -445,7 +449,15 @@ class PublicObservable @Inject() ( _ <- observableSrv.updateType(observable, newDataType)(graph, authContext) } yield Json.obj("dataType" -> value) }) - .property("data", UMapping.string.optional)(_.field.readonly) + .property("data", UMapping.string.optional)( + _.select(_.value(_.data)) + .filter[String] { + case (_, observables, _, Right(predicate)) => observables.has(_.data, predicate.mapValue(v => UseHashToIndex.hashToIndex(v).getOrElse(v))) + case (_, observables, _, Left(true)) => observables.has(_.data) + case (_, observables, _, Left(false)) => observables.hasNot(_.data) + } + .readonly + ) .property("attachment.name", UMapping.string.optional)(_.select(_.attachments.value(_.name)).readonly) .property("attachment.hashes", UMapping.hash.sequence)(_.select(_.attachments.value(_.hashes)).readonly) .property("attachment.size", UMapping.long.optional)(_.select(_.attachments.value(_.size)).readonly) diff --git a/thehive/app/org/thp/thehive/controllers/v1/Properties.scala b/thehive/app/org/thp/thehive/controllers/v1/Properties.scala index 588c83efc7..f07b9cbf7f 100644 --- a/thehive/app/org/thp/thehive/controllers/v1/Properties.scala +++ b/thehive/app/org/thp/thehive/controllers/v1/Properties.scala @@ -1,11 +1,13 @@ package org.thp.thehive.controllers.v1 +import org.apache.tinkerpop.gremlin.process.traversal.Compare import org.apache.tinkerpop.gremlin.structure.T import org.thp.scalligraph.controllers.{FPathElem, FPathEmpty, FString} import org.thp.scalligraph.models.{Database, UMapping} import org.thp.scalligraph.query.PredicateOps._ import org.thp.scalligraph.query.{PublicProperties, PublicPropertyListBuilder} import org.thp.scalligraph.traversal.TraversalOps._ +import org.thp.scalligraph.utils.Hasher import org.thp.scalligraph.{BadRequestError, EntityId, EntityIdOrName, InvalidFormatAttributeError, RichSeq} import org.thp.thehive.dto.v1.InputCustomFieldValue import org.thp.thehive.models._ @@ -28,7 +30,7 @@ import org.thp.thehive.services._ import play.api.libs.json.{JsObject, JsValue, Json} import javax.inject.{Inject, Singleton} -import scala.util.{Failure, Success, Try} +import scala.util.{Failure, Success} @Singleton class Properties @Inject() ( @@ -452,7 +454,15 @@ class Properties @Inject() ( _ <- observableSrv.updateType(observable, newDataType)(graph, authContext) } yield Json.obj("dataType" -> value) }) - .property("data", UMapping.string.optional)(_.field.readonly) + .property("data", UMapping.string.optional)( + _.select(_.value(_.data)) + .filter[String] { + case (_, observables, _, Right(predicate)) => observables.has(_.data, predicate.mapValue(v => UseHashToIndex.hashToIndex(v).getOrElse(v))) + case (_, observables, _, Left(true)) => observables.has(_.data) + case (_, observables, _, Left(false)) => observables.hasNot(_.data) + } + .readonly + ) .property("attachment.name", UMapping.string.optional)(_.select(_.attachments.value(_.name)).readonly) .property("attachment.hashes", UMapping.hash.sequence)(_.select(_.attachments.value(_.hashes)).readonly) .property("attachment.size", UMapping.long.optional)(_.select(_.attachments.value(_.size)).readonly) diff --git a/thehive/app/org/thp/thehive/models/Observable.scala b/thehive/app/org/thp/thehive/models/Observable.scala index b0a92a85e8..63d6b21c1c 100644 --- a/thehive/app/org/thp/thehive/models/Observable.scala +++ b/thehive/app/org/thp/thehive/models/Observable.scala @@ -1,9 +1,13 @@ package org.thp.thehive.models -import org.thp.scalligraph.models.{DefineIndex, Entity, IndexType} +import org.apache.tinkerpop.gremlin.structure.{Vertex, VertexProperty} +import org.thp.scalligraph.janus.{ImmenseStringTermFilter, ImmenseTermProcessor} +import org.thp.scalligraph.models.{DefineIndex, Entity, IndexType, UMapping} +import org.thp.scalligraph.utils.Hasher import org.thp.scalligraph.{BuildEdgeEntity, BuildVertexEntity, EntityId} import java.util.Date +import scala.util.Try @BuildEdgeEntity[Observable, KeyValue] case class ObservableKeyValue() @@ -46,6 +50,7 @@ case class Observable( case class RichObservable( observable: Observable with Entity, + fullData: Option[Data with Entity], attachment: Option[Attachment with Entity], seen: Option[Boolean], reportTags: Seq[ReportTag with Entity] @@ -60,12 +65,50 @@ case class RichObservable( def ioc: Boolean = observable.ioc def sighted: Boolean = observable.sighted def ignoreSimilarity: Option[Boolean] = observable.ignoreSimilarity - def dataOrAttachment: Either[String, Attachment with Entity] = observable.data.toLeft(attachment.get) + def dataOrAttachment: Either[String, Attachment with Entity] = data.toLeft(attachment.get) def dataType: String = observable.dataType - def data: Option[String] = observable.data + def data: Option[String] = fullData.map(d => d.fullData.getOrElse(d.data)) def tags: Seq[String] = observable.tags } @DefineIndex(IndexType.standard, "data") @BuildVertexEntity -case class Data(data: String) +case class Data(data: String, fullData: Option[String]) + +object UseHashToIndex extends ImmenseTermProcessor with ImmenseStringTermFilter { + override val termSizeLimit: Int = 8191 + private val hasher: Hasher = Hasher("SHA-256") + + def hashToIndex(value: String): Option[String] = + if (value.length > termSizeLimit) Some(hasher.fromString(value).head.toString) + else None + + override def apply[V](vertex: Vertex, property: VertexProperty[V]): Boolean = { + if (property.key() == "data") + vertex.label() match { + case "Observable" => + collect(vertex, property).foreach { strProp => + val currentValue = strProp.value() + logger.info(s"""Use hash for observable ~${vertex.id()}: + | dataType=${UMapping.string.getProperty(vertex, "dataType")} + | data=$currentValue + | message=${UMapping.string.optional.getProperty(vertex, "message").getOrElse("")} + | tags=${UMapping.string.sequence.getProperty(vertex, "message").mkString(", ")}""".stripMargin) + strProp.remove() + vertex.property(strProp.key(), hasher.fromString(currentValue).head.toString) + } + + case "Data" => + collect(vertex, property).foreach { strProp => + val currentValue = strProp.value() + logger.info(s"Use hash and move data for $vertex/${strProp.key()}: $currentValue") + strProp.remove() + vertex.property(strProp.key(), hasher.fromString(currentValue).head.toString) + vertex.property("fullData", currentValue) + } + + case _ => + } + false + } +} diff --git a/thehive/app/org/thp/thehive/services/ObservableSrv.scala b/thehive/app/org/thp/thehive/services/ObservableSrv.scala index 985b7a2302..91ffe952fb 100644 --- a/thehive/app/org/thp/thehive/services/ObservableSrv.scala +++ b/thehive/app/org/thp/thehive/services/ObservableSrv.scala @@ -10,7 +10,7 @@ import org.thp.scalligraph.services._ import org.thp.scalligraph.traversal.Converter.Identity import org.thp.scalligraph.traversal.TraversalOps._ import org.thp.scalligraph.traversal.{Converter, Graph, StepLabel, Traversal} -import org.thp.scalligraph.utils.Hash +import org.thp.scalligraph.utils.{Hash, Hasher} import org.thp.scalligraph.{BadRequestError, CreateError, EntityId, EntityIdOrName, EntityName, RichSeq} import org.thp.thehive.models._ import org.thp.thehive.services.AlertOps._ @@ -76,7 +76,7 @@ class ObservableSrv @Inject() ( _ <- observableObservableTypeSrv.create(ObservableObservableType(), createdObservable, observableType) _ <- observableAttachmentSrv.create(ObservableAttachment(), createdObservable, attachment) _ <- tags.toTry(observableTagSrv.create(ObservableTag(), createdObservable, _)) - } yield RichObservable(createdObservable, Some(attachment), None, Nil) + } yield RichObservable(createdObservable, None, Some(attachment), None, Nil) } def create( @@ -86,10 +86,11 @@ class ObservableSrv @Inject() ( graph: Graph, authContext: AuthContext ): Try[RichObservable] = { + val (dataOrHash, fullData) = UseHashToIndex.hashToIndex(dataValue).fold[(String, Option[String])](dataValue -> None)(_ -> Some(dataValue)) val alreadyExists = startTraversal .has(_.organisationIds, organisationSrv.currentId) .has(_.relatedId, observable.relatedId) - .has(_.data, dataValue) + .has(_.data, dataOrHash) .has(_.dataType, observable.dataType) .exists if (alreadyExists) Failure(CreateError("Observable already exists")) @@ -100,12 +101,12 @@ class ObservableSrv @Inject() ( if (observableType.isAttachment) Failure(BadRequestError("A attachment observable doesn't accept string value")) else Success(()) tags <- observable.tags.toTry(tagSrv.getOrCreate) - data <- dataSrv.create(Data(dataValue)) - createdObservable <- createEntity(observable.copy(data = Some(dataValue))) + data <- dataSrv.create(Data(dataOrHash, fullData)) + createdObservable <- createEntity(observable.copy(data = Some(dataOrHash))) _ <- observableObservableTypeSrv.create(ObservableObservableType(), createdObservable, observableType) _ <- observableDataSrv.create(ObservableData(), createdObservable, data) _ <- tags.toTry(observableTagSrv.create(ObservableTag(), createdObservable, _)) - } yield RichObservable(createdObservable, None, None, Nil) + } yield RichObservable(createdObservable, Some(data), None, None, Nil) } def addTags(observable: Observable with Entity, tags: Set[String])(implicit graph: Graph, authContext: AuthContext): Try[Seq[Tag with Entity]] = { @@ -289,14 +290,16 @@ object ObservableOps { traversal .project( _.by - .by(_.attachments.fold) + .by(_.data.option) + .by(_.attachments.option) .by(_.reportTags.fold) ) .domainMap { - case (observable, attachment, reportTags) => + case (observable, data, attachment, reportTags) => RichObservable( observable, - attachment.headOption, + data, + attachment, None, reportTags ) @@ -308,15 +311,17 @@ object ObservableOps { traversal .project( _.by - .by(_.attachments.fold) + .by(_.data.option) + .by(_.attachments.option) .by(_.filteredSimilar.visible(organisationSrv).limit(1).count) .by(_.reportTags.fold) ) .domainMap { - case (observable, attachment, count, reportTags) => + case (observable, data, attachment, count, reportTags) => RichObservable( observable, - attachment.headOption, + data, + attachment, Some(count != 0), reportTags ) @@ -329,16 +334,18 @@ object ObservableOps { traversal .project( _.by - .by(_.attachments.fold) + .by(_.data.option) + .by(_.attachments.option) .by(_.filteredSimilar.visible(organisationSrv).limit(1).count) .by(_.reportTags.fold) .by(entityRenderer) ) .domainMap { - case (observable, attachment, count, reportTags, renderedEntity) => + case (observable, data, attachment, count, reportTags, renderedEntity) => RichObservable( observable, - attachment.headOption, + data, + attachment, Some(count != 0), reportTags ) -> renderedEntity diff --git a/thehive/test/org/thp/thehive/DatabaseBuilder.scala b/thehive/test/org/thp/thehive/DatabaseBuilder.scala index eb3190e31e..84303cbbcc 100644 --- a/thehive/test/org/thp/thehive/DatabaseBuilder.scala +++ b/thehive/test/org/thp/thehive/DatabaseBuilder.scala @@ -3,7 +3,7 @@ package org.thp.thehive import org.scalactic.Or import org.thp.scalligraph.auth.{AuthContext, AuthContextImpl} import org.thp.scalligraph.controllers._ -import org.thp.scalligraph.models.{Database, Entity, Schema} +import org.thp.scalligraph.models.{Database, Entity} import org.thp.scalligraph.services.{EdgeSrv, GenIntegrityCheckOps, VertexSrv} import org.thp.scalligraph.traversal.Graph import org.thp.scalligraph.traversal.TraversalOps._ @@ -29,7 +29,6 @@ import scala.util.{Failure, Success, Try} @Singleton class DatabaseBuilder @Inject() ( - schema: Schema, alertSrv: AlertSrv, attachmentSrv: AttachmentSrv, caseSrv: CaseSrv, @@ -233,7 +232,7 @@ class DatabaseBuilder @Inject() ( dataSrv .getByName(data) .getOrFail("data") - .orElse(dataSrv.create(Data(data))) + .orElse(dataSrv.create(Data(data, None))) .flatMap(observableSrv.observableDataSrv.create(ObservableData(), observable, _)) .get )