From 318076254d851b1678898c9b2a749b0b760c0fe5 Mon Sep 17 00:00:00 2001
From: Ludy <Ludy87@users.noreply.github.com>
Date: Sun, 28 Apr 2024 00:26:12 +0200
Subject: [PATCH] Handling Untranslatable Strings (#1133)

---
 .github/workflows/sync_files.yml |   3 +
 HowToAddNewLanguage.md           |  13 +++
 scripts/counter_translation.py   | 148 +++++++++++++++++++++--------
 scripts/translation_status.toml  | 154 +++++++++++++++++++++++++++++++
 4 files changed, 279 insertions(+), 39 deletions(-)
 create mode 100644 scripts/translation_status.toml

diff --git a/.github/workflows/sync_files.yml b/.github/workflows/sync_files.yml
index eba5a019..ac1556cd 100644
--- a/.github/workflows/sync_files.yml
+++ b/.github/workflows/sync_files.yml
@@ -7,6 +7,7 @@ on:
     paths:
       - "build.gradle"
       - "src/main/resources/messages_*.properties"
+      - "scripts/translation_status.toml"
 
 permissions:
   contents: write
@@ -58,6 +59,8 @@ jobs:
         uses: actions/setup-python@v5.1.0
         with:
           python-version: "3.x"
+      - name: Install dependencies
+        run: pip install tomlkit
       - name: Sync README
         run: python scripts/counter_translation.py
       - name: Set up git config
diff --git a/HowToAddNewLanguage.md b/HowToAddNewLanguage.md
index 385ccbbc..73cad0e9 100644
--- a/HowToAddNewLanguage.md
+++ b/HowToAddNewLanguage.md
@@ -34,5 +34,18 @@ Then simply translate all property entries within that file and make a PR into m
 
 If you do not have a java IDE i am happy to verify the changes worked once you raise PR (but won't be able to verify the translations themselves)
 
+## Handling Untranslatable Strings
 
+Sometimes, certain strings in the properties file may not require translation because they are the same in the target language or are universal (like names of protocols, certain terminologies, etc.). To ensure accurate statistics for language progress, the keys of these strings should be added to the `translation_status.toml` file located in the `scripts` directory. This will exclude them from the translation progress calculations.
 
+For example, if the English string `error=Error` does not need translation in Polish, add its key `error` to the `translation_status.toml` file under the Polish (`pl_PL`) section:
+
+```toml
+[pl_PL]
+ignore = [
+    "language.direction",  # Existing entries
+    "error"                # Add new entries here
+]
+```
+
+Make sure to place the entry under the correct language section. This helps maintain the accuracy of translation progress statistics and ensures that the translation tool or scripts do not misinterpret the completion rate.
diff --git a/scripts/counter_translation.py b/scripts/counter_translation.py
index c7f57575..a4b6255b 100644
--- a/scripts/counter_translation.py
+++ b/scripts/counter_translation.py
@@ -10,49 +10,77 @@ Author: Ludy87
 Example:
     To use this script, simply run it from command line:
         $ python counter_translation.py
-"""
-import os
+"""  # noqa: D205
+
 import glob
+import os
 import re
-from typing import List, Tuple
+
+import tomlkit
+import tomlkit.toml_file
 
 
-def write_readme(progress_list: List[Tuple[str, int]]) -> None:
-    """
-    Updates the progress status in the README.md file based
+def convert_to_multiline(data: tomlkit.TOMLDocument) -> tomlkit.TOMLDocument:
+    """Converts 'ignore' and 'missing' arrays to multiline arrays and sorts the first-level keys of the TOML document.
+    Enhances readability and consistency in the TOML file by ensuring arrays contain unique and sorted entries.
+
+    Parameters:
+        data (tomlkit.TOMLDocument): The original TOML document containing the data.
+
+    Returns:
+        tomlkit.TOMLDocument: A new TOML document with sorted keys and properly formatted arrays.
+    """  # noqa: D205
+    sorted_data = tomlkit.document()
+    for key in sorted(data.keys()):
+        value = data[key]
+        if isinstance(value, dict):
+            new_table = tomlkit.table()
+            for subkey in ("ignore", "missing"):
+                if subkey in value:
+                    # Convert the list to a set to remove duplicates, sort it, and convert to multiline for readability
+                    unique_sorted_array = sorted(set(value[subkey]))
+                    array = tomlkit.array()
+                    array.multiline(True)
+                    for item in unique_sorted_array:
+                        array.append(item)
+                    new_table[subkey] = array
+            sorted_data[key] = new_table
+        else:
+            # Add other types of data unchanged
+            sorted_data[key] = value
+    return sorted_data
+
+
+def write_readme(progress_list: list[tuple[str, int]]) -> None:
+    """Updates the progress status in the README.md file based
     on the provided progress list.
 
     Parameters:
-        progress_list (List[Tuple[str, int]]): A list of tuples containing
+        progress_list (list[tuple[str, int]]): A list of tuples containing
         language and progress percentage.
 
     Returns:
         None
-    """
-    with open("README.md", "r", encoding="utf-8") as file:
-        content = file.read()
+    """  # noqa: D205
+    with open("README.md", encoding="utf-8") as file:
+        content = file.readlines()
 
-    lines = content.split("\n")
-    for i, line in enumerate(lines[2:], start=2):
+    for i, line in enumerate(content[2:], start=2):
         for progress in progress_list:
             language, value = progress
             if language in line:
-                match = re.search(r"\!\[(\d+(\.\d+)?)%\]\(.*\)", line)
-                if match:
-                    lines[i] = line.replace(
+                if match := re.search(r"\!\[(\d+(\.\d+)?)%\]\(.*\)", line):
+                    content[i] = line.replace(
                         match.group(0),
                         f"![{value}%](https://geps.dev/progress/{value})",
                     )
 
-    new_content = "\n".join(lines)
-
     with open("README.md", "w", encoding="utf-8") as file:
-        file.write(new_content)
+        file.writelines(content)
 
 
-def compare_files(default_file_path, files_directory) -> List[Tuple[str, int]]:
-    """
-    Compares the default properties file with other
+def compare_files(default_file_path, file_paths, translation_status_file) -> list[tuple[str, int]]:
+    """Compares the default properties file with other
     properties files in the directory.
 
     Parameters:
@@ -60,20 +88,22 @@ def compare_files(default_file_path, files_directory) -> List[Tuple[str, int]]:
         files_directory (str): The directory containing other properties files.
 
     Returns:
-        List[Tuple[str, int]]: A list of tuples containing
+        list[tuple[str, int]]: A list of tuples containing
         language and progress percentage.
-    """
-    file_paths = glob.glob(os.path.join(files_directory, "messages_*.properties"))
-    num_lines = sum(1 for _ in open(default_file_path, encoding="utf-8"))
+    """  # noqa: D205
+    num_lines = sum(
+        1 for line in open(default_file_path, encoding="utf-8") if line.strip() and not line.strip().startswith("#")
+    )
 
     result_list = []
+    sort_translation_status: tomlkit.TOMLDocument
+
+    # read toml
+    with open(translation_status_file, encoding="utf-8") as f:
+        sort_translation_status = tomlkit.parse(f.read())
 
     for file_path in file_paths:
-        language = (
-            os.path.basename(file_path)
-            .split("messages_", 1)[1]
-            .split(".properties", 1)[0]
-        )
+        language = os.path.basename(file_path).split("messages_", 1)[1].split(".properties", 1)[0]
 
         fails = 0
         if "en_GB" in language or "en_US" in language:
@@ -81,9 +111,21 @@ def compare_files(default_file_path, files_directory) -> List[Tuple[str, int]]:
             result_list.append(("en_US", 100))
             continue
 
-        with open(default_file_path, "r", encoding="utf-8") as default_file, open(
-            file_path, "r", encoding="utf-8"
-        ) as file:
+        if language not in sort_translation_status:
+            sort_translation_status[language] = tomlkit.table()
+
+        if (
+            "ignore" not in sort_translation_status[language]
+            or len(sort_translation_status[language].get("ignore", [])) < 1
+        ):
+            sort_translation_status[language]["ignore"] = tomlkit.array(["language.direction"])
+
+        # if "missing" not in sort_translation_status[language]:
+        #     sort_translation_status[language]["missing"] = tomlkit.array()
+        # elif "language.direction" in sort_translation_status[language]["missing"]:
+        #     sort_translation_status[language]["missing"].remove("language.direction")
+
+        with open(default_file_path, encoding="utf-8") as default_file, open(file_path, encoding="utf-8") as file:
             for _ in range(5):
                 next(default_file)
                 try:
@@ -91,24 +133,47 @@ def compare_files(default_file_path, files_directory) -> List[Tuple[str, int]]:
                 except StopIteration:
                     fails = num_lines
 
-            for _, (line_default, line_file) in enumerate(
-                zip(default_file, file), start=6
-            ):
+            for line_num, (line_default, line_file) in enumerate(zip(default_file, file), start=6):
                 try:
+                    # Skip empty lines and lines that start with #
+                    if line_default.strip() == "" or line_default.startswith("#"):
+                        continue
+
+                    default_key, default_value = line_default.split("=", 1)
+                    file_key, file_value = line_file.split("=", 1)
                     if (
-                        line_default.split("=", 1)[1].strip()
-                        == line_file.split("=", 1)[1].strip()
+                        default_value.strip() == file_value.strip()
+                        and default_key.strip() not in sort_translation_status[language]["ignore"]
                     ):
+                        print(f"{language}: Line {line_num} is missing the translation.")
+                        # if default_key.strip() not in sort_translation_status[language]["missing"]:
+                        #     missing_array = tomlkit.array()
+                        #     missing_array.append(default_key.strip())
+                        #     missing_array.multiline(True)
+                        #     sort_translation_status[language]["missing"].extend(missing_array)
                         fails += 1
+                    # elif default_key.strip() in sort_translation_status[language]["ignore"]:
+                    #     if default_key.strip() in sort_translation_status[language]["missing"]:
+                    #         sort_translation_status[language]["missing"].remove(default_key.strip())
+                    if default_value.strip() != file_value.strip():
+                        # if default_key.strip() in sort_translation_status[language]["missing"]:
+                        #     sort_translation_status[language]["missing"].remove(default_key.strip())
+                        if default_key.strip() in sort_translation_status[language]["ignore"]:
+                            sort_translation_status[language]["ignore"].remove(default_key.strip())
+
                 except IndexError:
                     pass
 
+        print(f"{language}: {fails} out of {num_lines} lines are not translated.")
         result_list.append(
             (
                 language,
                 int((num_lines - fails) * 100 / num_lines),
             )
         )
+    translation_status = convert_to_multiline(sort_translation_status)
+    with open(translation_status_file, "w", encoding="utf-8") as file:
+        file.write(tomlkit.dumps(translation_status))
 
     unique_data = list(set(result_list))
     unique_data.sort(key=lambda x: x[1], reverse=True)
@@ -118,5 +183,10 @@ def compare_files(default_file_path, files_directory) -> List[Tuple[str, int]]:
 
 if __name__ == "__main__":
     directory = os.path.join(os.getcwd(), "src", "main", "resources")
+    messages_file_paths = glob.glob(os.path.join(directory, "messages_*.properties"))
     reference_file = os.path.join(directory, "messages_en_GB.properties")
-    write_readme(compare_files(reference_file, directory))
+
+    scripts_directory = os.path.join(os.getcwd(), "scripts")
+    translation_state_file = os.path.join(scripts_directory, "translation_status.toml")
+
+    write_readme(compare_files(reference_file, messages_file_paths, translation_state_file))
diff --git a/scripts/translation_status.toml b/scripts/translation_status.toml
new file mode 100644
index 00000000..335ef567
--- /dev/null
+++ b/scripts/translation_status.toml
@@ -0,0 +1,154 @@
+[ar_AR]
+ignore = [
+    'language.direction',
+]
+
+[bg_BG]
+ignore = [
+    'language.direction',
+]
+
+[ca_CA]
+ignore = [
+    'language.direction',
+]
+
+[de_DE]
+ignore = [
+    'AddStampRequest.alphabet',
+    'AddStampRequest.position',
+    'PDFToBook.selectText.1',
+    'PDFToText.tags',
+    'addPageNumbers.selectText.3',
+    'alphabet',
+    'certSign.name',
+    'language.direction',
+    'licenses.version',
+    'pipeline.title',
+    'pipelineOptions.pipelineHeader',
+    'sponsor',
+    'text',
+    'watermark.type.1',
+]
+
+[el_GR]
+ignore = [
+    'language.direction',
+]
+
+[es_ES]
+ignore = [
+    'adminUserSettings.roles',
+    'color',
+    'language.direction',
+    'no',
+    'showJS.tags',
+]
+
+[eu_ES]
+ignore = [
+    'language.direction',
+]
+
+[fr_FR]
+ignore = [
+    'language.direction',
+]
+
+[hi_IN]
+ignore = [
+    'language.direction',
+]
+
+[hu_HU]
+ignore = [
+    'language.direction',
+]
+
+[id_ID]
+ignore = [
+    'language.direction',
+]
+
+[it_IT]
+ignore = [
+    'font',
+    'language.direction',
+    'no',
+    'password',
+    'pipeline.title',
+    'pipelineOptions.pipelineHeader',
+    'removePassword.selectText.2',
+    'showJS.tags',
+    'sponsor',
+]
+
+[ja_JP]
+ignore = [
+    'language.direction',
+]
+
+[ko_KR]
+ignore = [
+    'language.direction',
+]
+
+[nl_NL]
+ignore = [
+    'language.direction',
+]
+
+[pl_PL]
+ignore = [
+    'language.direction',
+]
+
+[pt_BR]
+ignore = [
+    'language.direction',
+]
+
+[pt_PT]
+ignore = [
+    'language.direction',
+]
+
+[ro_RO]
+ignore = [
+    'language.direction',
+]
+
+[ru_RU]
+ignore = [
+    'language.direction',
+]
+
+[sr_LATN_RS]
+ignore = [
+    'language.direction',
+]
+
+[sv_SE]
+ignore = [
+    'language.direction',
+]
+
+[tr_TR]
+ignore = [
+    'language.direction',
+]
+
+[uk_UA]
+ignore = [
+    'language.direction',
+]
+
+[zh_CN]
+ignore = [
+    'language.direction',
+]
+
+[zh_TW]
+ignore = [
+    'language.direction',
+]