From 9c408eb2fb49932a5dc209fcc23bfbf1601f68e9 Mon Sep 17 00:00:00 2001
From: cccs-rs <62077998+cccs-rs@users.noreply.github.com>
Date: Tue, 4 Mar 2025 05:28:04 +0000
Subject: [PATCH 1/4] Add missing control characters w/ testing

---
 assemblyline/common/str_utils.py | 76 +++++++++++++++++++++++---------
 test/test_str_utils.py           | 13 ++++--
 2 files changed, 65 insertions(+), 24 deletions(-)

diff --git a/assemblyline/common/str_utils.py b/assemblyline/common/str_utils.py
index 9150a1cbe..4494f19de 100644
--- a/assemblyline/common/str_utils.py
+++ b/assemblyline/common/str_utils.py
@@ -1,26 +1,49 @@
 import re
 from copy import copy
+from enum import Enum
 from typing import Literal, Union, overload
 
 import chardet
 
-
-def remove_bidir_unicode_controls(in_str):
+# Reference: https://unicode.org/reports/tr9/#Directional_Formatting_Characters
+class DirectionalFormattingCharacter(Enum):
+    LRE = u'\u202A' # Left-to-Right Embedding
+    RLE = u'\u202B' # Right-to-Left Embedding
+    PDF = u'\u202C' # Pop Directional Formatting
+    LRO = u'\u202D' # Left-to-Right Override
+    RLO = u'\u202E' # Right-to-Left Override
+    LRI = u'\u2066' # Left-to-Right Isolate
+    RLI = u'\u2067' # Right-to-Left Isolate
+    FSI = u'\u2068' # First Strong Isolate
+    PDI = u'\u2069' # Pop Directional Isolate
+    LRM = u'\u200E' # Left-to-Right Mark
+    RLM = u'\u200F' # Right-to-Left Mark
+    ALM = u'\u061C' # Arabic Letter Mark
+
+CONTROL_CHARS = []
+EO_CONTROL_CHARS = []
+I_CONTROL_CHARS = []
+
+for c in DirectionalFormattingCharacter:
+    CONTROL_CHARS.append(c.value)
+    if c in [DirectionalFormattingCharacter.LRE, DirectionalFormattingCharacter.RLE,
+             DirectionalFormattingCharacter.LRO, DirectionalFormattingCharacter.RLO]:
+        EO_CONTROL_CHARS.append(c.value)
+    elif c in [DirectionalFormattingCharacter.LRI, DirectionalFormattingCharacter.RLI,
+               DirectionalFormattingCharacter.FSI]:
+        I_CONTROL_CHARS.append(c.value)
+
+def remove_bidir_unicode_controls(in_str: str):
     # noinspection PyBroadException
     try:
-        no_controls_str = ''.join(
-            c for c in in_str if c not in [
-                u'\u202E', u'\u202B', u'\u202D',
-                u'\u202A', u'\u200E', u'\u200F',
-            ]
-        )
+        no_controls_str = ''.join(c for c in in_str if c not in CONTROL_CHARS)
     except Exception:
         no_controls_str = in_str
 
     return no_controls_str
 
 
-def wrap_bidir_unicode_string(uni_str):
+def wrap_bidir_unicode_string(uni_str: Union[str, bytes]) -> Union[str, bytes]:
     """
     Wraps str in a LRE (Left-to-Right Embed) unicode control
     Guarantees that str can be concatenated to other strings without
@@ -30,25 +53,38 @@ def wrap_bidir_unicode_string(uni_str):
     if len(uni_str) == 0 or isinstance(uni_str, bytes):  # Not str, return it unchanged
         return uni_str
 
-    re_obj = re.search(r'[\u202E\u202B\u202D\u202A\u200E\u200F]', uni_str)
+    re_obj = re.search(rf"[{''.join(CONTROL_CHARS)}]", uni_str)
     if re_obj is None or len(re_obj.group()) == 0:  # No unicode bidir controls found, return string unchanged
         return uni_str
 
     # Parse str for unclosed bidir blocks
-    count = 0
+    idf_count = 0   # Isolate Directional Formatting Count
+    eodf_count = 0  # Embedding and Override Directional Formatting Count
+
     for letter in uni_str:
-        if letter in [u'\u202A', u'\u202B', u'\u202D', u'\u202E']:  # bidir block open?
-            count += 1
-        elif letter == u'\u202c':
-            if count > 0:
-                count -= 1
+        # Look for a block opened by an embedding or override character
+        if letter in EO_CONTROL_CHARS:
+            eodf_count += 1
+        # Look for a block closed by a Pop Directional Formatting (PDF) character
+        elif letter == DirectionalFormattingCharacter.PDF.value:
+            if eodf_count > 0:
+                eodf_count -= 1
+        # Look for block open with isolate characters
+        elif letter in I_CONTROL_CHARS:
+            idf_count += 1
+        # Look for block close with isolate characters
+        elif letter == DirectionalFormattingCharacter.PDI.value:
+            if idf_count > 0:
+                idf_count -= 1
 
     # close all bidir blocks
-    if count > 0:
-        uni_str += (u'\u202c' * count)
+    if eodf_count > 0:
+        uni_str += (DirectionalFormattingCharacter.PDF.value * eodf_count)
+    if idf_count > 0:
+        uni_str += (DirectionalFormattingCharacter.PDI.value * idf_count)
 
-        # Final wrapper (LTR block) to neutralize any Marks (u+200E and u+200F)
-    uni_str = u'\u202A' + uni_str + u'\u202C'
+    # Final wrapper (LTR block) to neutralize any Marks (u+200E and u+200F)
+    uni_str = DirectionalFormattingCharacter.LRE.value + uni_str + DirectionalFormattingCharacter.PDF.value
 
     return uni_str
 
diff --git a/test/test_str_utils.py b/test/test_str_utils.py
index bb5e4d35f..1e885d161 100644
--- a/test/test_str_utils.py
+++ b/test/test_str_utils.py
@@ -74,17 +74,22 @@ def test_truncate():
 
 
 def test_remove_bidir_unicode_controls():
-    test_str = 'a\u202Db\u202Ac\u200Ed\u200Fe\u202Efg\u202B'
-    assert str_utils.remove_bidir_unicode_controls(test_str) == 'abcdefg'
+    test_str = 'a'.join(str_utils.CONTROL_CHARS)
+    assert str_utils.remove_bidir_unicode_controls(test_str) == 'a' * (len(str_utils.CONTROL_CHARS) - 1)
 
     other_test_str = 'abcdéfg'
     assert str_utils.remove_bidir_unicode_controls(other_test_str) == 'abcdéfg'
 
 
 def test_wrap_bidir_unicode_string():
-    test_str = 'a\u202Db\u202Acde\u202Efg\u202B'
+    from assemblyline.common.str_utils import DirectionalFormattingCharacter as DFC
+    test_str = 'a'.join(str_utils.EO_CONTROL_CHARS) + 'a'.join(str_utils.I_CONTROL_CHARS)
     a = str_utils.wrap_bidir_unicode_string(test_str)
-    assert a == '\u202aa\u202db\u202Acde\u202efg\u202b\u202c\u202c\u202c\u202c\u202c'
+    assert a == DFC.LRE.value + \
+                test_str + \
+                len(str_utils.EO_CONTROL_CHARS) * DFC.PDF.value + \
+                len(str_utils.I_CONTROL_CHARS) * DFC.PDI.value + \
+                DFC.PDF.value
 
     byte_str = b'\u202Dabcdefg'
     assert str_utils.wrap_bidir_unicode_string(byte_str) == b'\u202Dabcdefg'

From f811fe4c2c95a6c70248146393a201373ba49519 Mon Sep 17 00:00:00 2001
From: cccs-rs <62077998+cccs-rs@users.noreply.github.com>
Date: Tue, 4 Mar 2025 08:33:20 -0500
Subject: [PATCH 2/4] Update azure-tests.yaml to use the latest root-build image

---
 pipelines/azure-tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/azure-tests.yaml b/pipelines/azure-tests.yaml
index f12c98af4..c8cf12580 100644
--- a/pipelines/azure-tests.yaml
+++ b/pipelines/azure-tests.yaml
@@ -107,7 +107,7 @@ jobs:
         displayName: Test
   - job: run_identify_test_latest
     container:
-      image: cccstemp.azurecr.io/assemblyline-root-build:dev
+      image: cccstemp.azurecr.io/assemblyline-root-build:latest
       endpoint: cccstemp
     timeoutInMinutes: 10
     steps:

From 4b4d5addf67cb0cac5b6b5e87784064175cf636d Mon Sep 17 00:00:00 2001
From: cccs-rs <62077998+cccs-rs@users.noreply.github.com>
Date: Fri, 7 Mar 2025 12:12:46 -0500
Subject: [PATCH 3/4] Mention ALM (u+061C) in the final wrapper comment of str_utils.py

Co-authored-by: gdesmar <75089569+gdesmar@users.noreply.github.com>
---
 assemblyline/common/str_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assemblyline/common/str_utils.py b/assemblyline/common/str_utils.py
index 4494f19de..4da0b91c9 100644
--- a/assemblyline/common/str_utils.py
+++ b/assemblyline/common/str_utils.py
@@ -83,7 +83,7 @@ def wrap_bidir_unicode_string(uni_str: Union[str, bytes]) -> Union[str, bytes]:
     if idf_count > 0:
         uni_str += (DirectionalFormattingCharacter.PDI.value * idf_count)
 
-    # Final wrapper (LTR block) to neutralize any Marks (u+200E and u+200F)
+    # Final wrapper (LTR block) to neutralize any Marks (u+200E, u+200F and u+061C)
     uni_str = DirectionalFormattingCharacter.LRE.value + uni_str + DirectionalFormattingCharacter.PDF.value
 
     return uni_str

From 547955085b48c02cb9d104e5cfe88ae1147ef26b Mon Sep 17 00:00:00 2001
From: cccs-rs <62077998+cccs-rs@users.noreply.github.com>
Date: Fri, 7 Mar 2025 17:48:30 +0000
Subject: [PATCH 4/4] Insert PDF unicode character in test string to ensure
 proper wrapping

---
 test/test_str_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_str_utils.py b/test/test_str_utils.py
index 1e885d161..e6cedb5b9 100644
--- a/test/test_str_utils.py
+++ b/test/test_str_utils.py
@@ -83,11 +83,11 @@ def test_remove_bidir_unicode_controls():
 
 def test_wrap_bidir_unicode_string():
     from assemblyline.common.str_utils import DirectionalFormattingCharacter as DFC
-    test_str = 'a'.join(str_utils.EO_CONTROL_CHARS) + 'a'.join(str_utils.I_CONTROL_CHARS)
+    test_str = 'a'.join(str_utils.EO_CONTROL_CHARS) + DFC.PDF.value + 'a'.join(str_utils.I_CONTROL_CHARS)
     a = str_utils.wrap_bidir_unicode_string(test_str)
     assert a == DFC.LRE.value + \
                 test_str + \
-                len(str_utils.EO_CONTROL_CHARS) * DFC.PDF.value + \
+                (len(str_utils.EO_CONTROL_CHARS) - 1) * DFC.PDF.value + \
                 len(str_utils.I_CONTROL_CHARS) * DFC.PDI.value + \
                 DFC.PDF.value