From 9c408eb2fb49932a5dc209fcc23bfbf1601f68e9 Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Tue, 4 Mar 2025 05:28:04 +0000 Subject: [PATCH 1/4] Add missing control characters w/ testing --- assemblyline/common/str_utils.py | 76 +++++++++++++++++++++++--------- test/test_str_utils.py | 13 ++++-- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/assemblyline/common/str_utils.py b/assemblyline/common/str_utils.py index 9150a1cbe..4494f19de 100644 --- a/assemblyline/common/str_utils.py +++ b/assemblyline/common/str_utils.py @@ -1,26 +1,49 @@ import re from copy import copy +from enum import Enum from typing import Literal, Union, overload import chardet - -def remove_bidir_unicode_controls(in_str): +# Reference: https://unicode.org/reports/tr9/#Directional_Formatting_Characters +class DirectionalFormattingCharacter(Enum): + LRE = u'\u202A' # Left-to-Right Embedding + RLE = u'\u202B' # Right-to-Left Embedding + PDF = u'\u202C' # Pop Directional Formatting + LRO = u'\u202D' # Left-to-Right Override + RLO = u'\u202E' # Right-to-Left Override + LRI = u'\u2066' # Left-to-Right Isolate + RLI = u'\u2067' # Right-to-Left Isolate + FSI = u'\u2068' # First Strong Isolate + PDI = u'\u2069' # Pop Directional Isolate + LRM = u'\u200E' # Left-to-Right Mark + RLM = u'\u200F' # Right-to-Left Mark + ALM = u'\u061C' # Arabic Letter Mark + +CONTROL_CHARS = [] +EO_CONTROL_CHARS = [] +I_CONTROL_CHARS = [] + +for c in DirectionalFormattingCharacter: + CONTROL_CHARS.append(c.value) + if c in [DirectionalFormattingCharacter.LRE, DirectionalFormattingCharacter.RLE, + DirectionalFormattingCharacter.LRO, DirectionalFormattingCharacter.RLO]: + EO_CONTROL_CHARS.append(c.value) + elif c in [DirectionalFormattingCharacter.LRI, DirectionalFormattingCharacter.RLI, + DirectionalFormattingCharacter.FSI]: + I_CONTROL_CHARS.append(c.value) + +def remove_bidir_unicode_controls(in_str: str): # noinspection PyBroadException try: - no_controls_str = ''.join( - c for c in in_str if c not in [ - u'\u202E', u'\u202B', u'\u202D', - u'\u202A', u'\u200E', u'\u200F', - ] - ) + no_controls_str = ''.join(c for c in in_str if c not in CONTROL_CHARS) except Exception: no_controls_str = in_str return no_controls_str -def wrap_bidir_unicode_string(uni_str): +def wrap_bidir_unicode_string(uni_str: Union[str, bytes]) -> Union[str, bytes]: """ Wraps str in a LRE (Left-to-Right Embed) unicode control Guarantees that str can be concatenated to other strings without @@ -30,25 +53,38 @@ def wrap_bidir_unicode_string(uni_str): if len(uni_str) == 0 or isinstance(uni_str, bytes): # Not str, return it unchanged return uni_str - re_obj = re.search(r'[\u202E\u202B\u202D\u202A\u200E\u200F]', uni_str) + re_obj = re.search(rf"[{''.join(CONTROL_CHARS)}]", uni_str) if re_obj is None or len(re_obj.group()) == 0: # No unicode bidir controls found, return string unchanged return uni_str # Parse str for unclosed bidir blocks - count = 0 + idf_count = 0 # Isolate Directional Formatting Count + eodf_count = 0 # Embedding and Override Directional Formatting Count + for letter in uni_str: - if letter in [u'\u202A', u'\u202B', u'\u202D', u'\u202E']: # bidir block open? - count += 1 - elif letter == u'\u202c': - if count > 0: - count -= 1 + # Look for block open with embedded or override characters + if letter in EO_CONTROL_CHARS: + eodf_count += 1 + # Look for block close with embedded or override characters + elif letter == DirectionalFormattingCharacter.PDF.value: + if eodf_count > 0: + eodf_count -= 1 + # Look for block open with isolate characters + elif letter in I_CONTROL_CHARS: + idf_count += 1 + # Look for block close with isolate characters + elif letter == DirectionalFormattingCharacter.PDI.value: + if idf_count > 0: + idf_count -= 1 # close all bidir blocks - if count > 0: - uni_str += (u'\u202c' * count) + if eodf_count > 0: + uni_str += (DirectionalFormattingCharacter.PDF.value * eodf_count) + if idf_count > 0: + uni_str += (DirectionalFormattingCharacter.PDI.value * idf_count) - # Final wrapper (LTR block) to neutralize any Marks (u+200E and u+200F) - uni_str = u'\u202A' + uni_str + u'\u202C' + # Final wrapper (LTR block) to neutralize any Marks (u+200E and u+200F) + uni_str = DirectionalFormattingCharacter.LRE.value + uni_str + DirectionalFormattingCharacter.PDF.value return uni_str diff --git a/test/test_str_utils.py b/test/test_str_utils.py index bb5e4d35f..1e885d161 100644 --- a/test/test_str_utils.py +++ b/test/test_str_utils.py @@ -74,17 +74,22 @@ def test_truncate(): def test_remove_bidir_unicode_controls(): - test_str = 'a\u202Db\u202Ac\u200Ed\u200Fe\u202Efg\u202B' - assert str_utils.remove_bidir_unicode_controls(test_str) == 'abcdefg' + test_str = 'a'.join(str_utils.CONTROL_CHARS) + assert str_utils.remove_bidir_unicode_controls(test_str) == 'a' * (len(str_utils.CONTROL_CHARS) - 1) other_test_str = 'abcdéfg' assert str_utils.remove_bidir_unicode_controls(other_test_str) == 'abcdéfg' def test_wrap_bidir_unicode_string(): - test_str = 'a\u202Db\u202Acde\u202Efg\u202B' + from assemblyline.common.str_utils import DirectionalFormattingCharacter as DFC + test_str = 'a'.join(str_utils.EO_CONTROL_CHARS) + 'a'.join(str_utils.I_CONTROL_CHARS) a = str_utils.wrap_bidir_unicode_string(test_str) - assert a == '\u202aa\u202db\u202Acde\u202efg\u202b\u202c\u202c\u202c\u202c\u202c' + assert a == DFC.LRE.value + \ + test_str + \ + len(str_utils.EO_CONTROL_CHARS) * DFC.PDF.value + \ + len(str_utils.I_CONTROL_CHARS) * DFC.PDI.value + \ + DFC.PDF.value byte_str = b'\u202Dabcdefg' assert str_utils.wrap_bidir_unicode_string(byte_str) == b'\u202Dabcdefg' From f811fe4c2c95a6c70248146393a201373ba49519 Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Tue, 4 Mar 2025 08:33:20 -0500 Subject: [PATCH 2/4] Update azure-tests.yaml --- pipelines/azure-tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/azure-tests.yaml b/pipelines/azure-tests.yaml index f12c98af4..c8cf12580 100644 --- a/pipelines/azure-tests.yaml +++ b/pipelines/azure-tests.yaml @@ -107,7 +107,7 @@ jobs: displayName: Test - job: run_identify_test_latest container: - image: cccstemp.azurecr.io/assemblyline-root-build:dev + image: cccstemp.azurecr.io/assemblyline-root-build:latest endpoint: cccstemp timeoutInMinutes: 10 steps: From 4b4d5addf67cb0cac5b6b5e87784064175cf636d Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Fri, 7 Mar 2025 12:12:46 -0500 Subject: [PATCH 3/4] Update assemblyline/common/str_utils.py Co-authored-by: gdesmar <75089569+gdesmar@users.noreply.github.com> --- assemblyline/common/str_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assemblyline/common/str_utils.py b/assemblyline/common/str_utils.py index 4494f19de..4da0b91c9 100644 --- a/assemblyline/common/str_utils.py +++ b/assemblyline/common/str_utils.py @@ -83,7 +83,7 @@ def wrap_bidir_unicode_string(uni_str: Union[str, bytes]) -> Union[str, bytes]: if idf_count > 0: uni_str += (DirectionalFormattingCharacter.PDI.value * idf_count) - # Final wrapper (LTR block) to neutralize any Marks (u+200E and u+200F) + # Final wrapper (LTR block) to neutralize any Marks (u+200E, u+200F and u+061C) uni_str = DirectionalFormattingCharacter.LRE.value + uni_str + DirectionalFormattingCharacter.PDF.value return uni_str From 547955085b48c02cb9d104e5cfe88ae1147ef26b Mon Sep 17 00:00:00 2001 From: cccs-rs <62077998+cccs-rs@users.noreply.github.com> Date: Fri, 7 Mar 2025 17:48:30 +0000 Subject: [PATCH 4/4] Insert PDF unicode character in test string to ensure proper wrapping --- test/test_str_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_str_utils.py b/test/test_str_utils.py index 1e885d161..e6cedb5b9 100644 --- a/test/test_str_utils.py +++ b/test/test_str_utils.py @@ -83,11 +83,11 @@ def test_remove_bidir_unicode_controls(): def test_wrap_bidir_unicode_string(): from assemblyline.common.str_utils import DirectionalFormattingCharacter as DFC - test_str = 'a'.join(str_utils.EO_CONTROL_CHARS) + 'a'.join(str_utils.I_CONTROL_CHARS) + test_str = 'a'.join(str_utils.EO_CONTROL_CHARS) + DFC.PDF.value + 'a'.join(str_utils.I_CONTROL_CHARS) a = str_utils.wrap_bidir_unicode_string(test_str) assert a == DFC.LRE.value + \ test_str + \ - len(str_utils.EO_CONTROL_CHARS) * DFC.PDF.value + \ + (len(str_utils.EO_CONTROL_CHARS) - 1) * DFC.PDF.value + \ len(str_utils.I_CONTROL_CHARS) * DFC.PDI.value + \ DFC.PDF.value