From 8cf291ebbd7f29102d6e08587d1a3827eff2905e Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 2 Jan 2025 15:37:58 +0800
Subject: [PATCH 01/19] chore: refactor partial code

---
 .../src/pipelines/indexing/__init__.py        |   8 +-
 .../pipelines/indexing/sql_pairs_deletion.py  |   2 +-
 .../indexing/sql_pairs_preparation.py         | 116 +++++++++---------
 .../web/v1/services/sql_pairs_preparation.py  |   2 +-
 4 files changed, 62 insertions(+), 66 deletions(-)

diff --git a/wren-ai-service/src/pipelines/indexing/__init__.py b/wren-ai-service/src/pipelines/indexing/__init__.py
index 4b8685755..e2c821683 100644
--- a/wren-ai-service/src/pipelines/indexing/__init__.py
+++ b/wren-ai-service/src/pipelines/indexing/__init__.py
@@ -93,7 +93,9 @@ def __init__(self, sql_pairs_store: DocumentStore) -> None:
         self._sql_pairs_store = sql_pairs_store
 
     @component.output_types()
-    async def run(self, sql_pair_ids: List[str], id: Optional[str] = None) -> None:
+    async def run(
+        self, sql_pair_ids: List[str], project_id: Optional[str] = None
+    ) -> None:
         filters = {
             "operator": "AND",
             "conditions": [
@@ -101,9 +103,9 @@ async def run(self, sql_pair_ids: List[str], id: Optional[str] = None) -> None:
             ],
         }
 
-        if id:
+        if project_id:
             filters["conditions"].append(
-                {"field": "project_id", "operator": "==", "value": id}
+                {"field": "project_id", "operator": "==", "value": project_id}
             )
 
         return await self._sql_pairs_store.delete_documents(filters)
diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py b/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py
index 944258f82..42683cede 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py
@@ -20,7 +20,7 @@ async def delete_sql_pairs(
     sql_pair_ids: List[str],
     id: Optional[str] = None,
 ) -> None:
-    return await sql_pairs_cleaner.run(sql_pair_ids=sql_pair_ids, id=id)
+    return await sql_pairs_cleaner.run(sql_pair_ids=sql_pair_ids, project_id=id)
 
 
 ## End of Pipeline
diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
index fce06e624..839bb2cb6 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
@@ -21,7 +21,7 @@
 logger = logging.getLogger("wren-ai-service")
 
 
-sql_intention_generation_system_prompt = """
+_system_prompt = """
 ### TASK ###
 
 You are a data analyst great at generating the concise and readable summary of a SQL query.
@@ -40,7 +40,7 @@
 }
 """
 
-sql_intention_generation_user_prompt_template = """
+_user_prompt_template = """
 ### INPUT ###
 SQL: {{sql}}
 
@@ -49,27 +49,22 @@
 
 
 @component
-class SqlPairsDescriptionConverter:
+class SqlPairsConverter:
     @component.output_types(documents=List[Document])
-    def run(self, sql_pairs: List[Dict[str, Any]], id: Optional[str] = None):
-        logger.info("Converting SQL pairs to documents...")
+    def run(self, sql_pairs: List[Dict[str, Any]], project_id: Optional[str] = ""):
+        logger.info(f"Project ID: {project_id} Converting SQL pairs to documents...")
+
+        addition = {"project_id": project_id} if project_id else {}
 
         return {
             "documents": [
                 Document(
-                    id=str(uuid.uuid4()),
-                    meta=(
-                        {
-                            "sql_pair_id": sql_pair.get("id"),
-                            "project_id": id,
-                            "sql": sql_pair.get("sql"),
-                        }
-                        if id
-                        else {
-                            "sql_pair_id": sql_pair.get("id"),
-                            "sql": sql_pair.get("sql"),
-                        }
-                    ),
+                    id=sql_pair.get("id", str(uuid.uuid4())),
+                    meta={
+                        "sql_pair_id": sql_pair.get("id"),
+                        "sql": sql_pair.get("sql"),
+                        **addition,
+                    },
                     content=sql_pair.get("intention"),
                 )
                 for sql_pair in sql_pairs
@@ -89,16 +84,16 @@ def prompts(
 @observe(as_type="generation", capture_input=False)
 async def generate_sql_intention(
     prompts: List[dict],
-    sql_intention_generator: Any,
+    generator: Any,
 ) -> List[dict]:
     async def _task(prompt: str, generator: Any):
         return await generator(prompt=prompt.get("prompt"))
 
-    tasks = [_task(prompt, sql_intention_generator) for prompt in prompts]
+    tasks = [_task(prompt, generator) for prompt in prompts]
     return await asyncio.gather(*tasks)
 
 
-@observe()
+@observe(capture_input=False)
 def post_process(
     generate_sql_intention: List[dict],
     sql_pairs: List[SqlPair],
@@ -115,56 +110,54 @@ def post_process(
 
 
 @observe(capture_input=False)
-def convert_sql_pairs_to_documents(
+def to_documents(
     post_process: List[Dict[str, Any]],
-    sql_pairs_description_converter: SqlPairsDescriptionConverter,
-    id: Optional[str] = None,
+    document_converter: SqlPairsConverter,
+    project_id: Optional[str] = "",
 ) -> Dict[str, Any]:
-    return sql_pairs_description_converter.run(sql_pairs=post_process, id=id)
+    return document_converter.run(sql_pairs=post_process, project_id=project_id)
 
 
 @observe(capture_input=False, capture_output=False)
-async def embed_sql_pairs(
-    convert_sql_pairs_to_documents: Dict[str, Any],
-    document_embedder: Any,
+async def embedding(
+    to_documents: Dict[str, Any],
+    embedder: Any,
 ) -> Dict[str, Any]:
-    return await document_embedder.run(
-        documents=convert_sql_pairs_to_documents["documents"]
-    )
+    return await embedder.run(documents=to_documents["documents"])
 
 
 @observe(capture_input=False, capture_output=False)
-async def delete_sql_pairs(
-    sql_pairs_cleaner: SqlPairsCleaner,
+async def clean(
+    cleaner: SqlPairsCleaner,
     sql_pairs: List[SqlPair],
-    embed_sql_pairs: Dict[str, Any],
-    id: Optional[str] = None,
-) -> List[SqlPair]:
+    embedding: Dict[str, Any],
+    project_id: Optional[str] = "",
+) -> Dict[str, Any]:
     sql_pair_ids = [sql_pair.id for sql_pair in sql_pairs]
-    await sql_pairs_cleaner.run(sql_pair_ids=sql_pair_ids, id=id)
+    await cleaner.run(sql_pair_ids=sql_pair_ids, project_id=project_id)
 
-    return embed_sql_pairs
+    return embedding
 
 
 @observe(capture_input=False)
-async def write_sql_pairs(
-    embed_sql_pairs: Dict[str, Any],
-    sql_pairs_writer: AsyncDocumentWriter,
+async def write(
+    clean: Dict[str, Any],
+    writer: AsyncDocumentWriter,
 ) -> None:
-    return await sql_pairs_writer.run(documents=embed_sql_pairs["documents"])
+    return await writer.run(documents=clean["documents"])
 
 
 ## End of Pipeline
-class SqlIntentionGenerationResult(BaseModel):
+class SqlIntentionResult(BaseModel):
     intention: str
 
 
-SQL_INTENTION_GENERATION_MODEL_KWARGS = {
+_GENERATION_MODEL_KWARGS = {
     "response_format": {
         "type": "json_schema",
         "json_schema": {
             "name": "sql_intention_results",
-            "schema": SqlIntentionGenerationResult.model_json_schema(),
+            "schema": SqlIntentionResult.model_json_schema(),
         },
     }
 }
@@ -178,21 +171,19 @@ def __init__(
         document_store_provider: DocumentStoreProvider,
         **kwargs,
     ) -> None:
-        sql_pairs_store = document_store_provider.get_store(dataset_name="sql_pairs")
+        store = document_store_provider.get_store(dataset_name="sql_pairs")
 
         self._components = {
-            "sql_pairs_cleaner": SqlPairsCleaner(sql_pairs_store),
-            "prompt_builder": PromptBuilder(
-                template=sql_intention_generation_user_prompt_template
+            "cleaner": SqlPairsCleaner(store),
+            "prompt_builder": PromptBuilder(template=_user_prompt_template),
+            "generator": llm_provider.get_generator(
+                system_prompt=_system_prompt,
+                generation_kwargs=_GENERATION_MODEL_KWARGS,
             ),
-            "sql_intention_generator": llm_provider.get_generator(
-                system_prompt=sql_intention_generation_system_prompt,
-                generation_kwargs=SQL_INTENTION_GENERATION_MODEL_KWARGS,
-            ),
-            "document_embedder": embedder_provider.get_document_embedder(),
-            "sql_pairs_description_converter": SqlPairsDescriptionConverter(),
-            "sql_pairs_writer": AsyncDocumentWriter(
-                document_store=sql_pairs_store,
+            "embedder": embedder_provider.get_document_embedder(),
+            "document_converter": SqlPairsConverter(),
+            "writer": AsyncDocumentWriter(
+                document_store=store,
                 policy=DuplicatePolicy.OVERWRITE,
             ),
         }
@@ -203,14 +194,17 @@ def __init__(
 
     @observe(name="SQL Pairs Preparation")
     async def run(
-        self, sql_pairs: List[SqlPair], id: Optional[str] = None
+        self, sql_pairs: List[SqlPair], project_id: Optional[str] = ""
     ) -> Dict[str, Any]:
-        logger.info("SQL Pairs Preparation pipeline is running...")
+        logger.info(
+            f"Project ID: {project_id} SQL Pairs Preparation pipeline is running..."
+        )
+
         return await self._pipe.execute(
-            ["write_sql_pairs"],
+            ["write"],
             inputs={
                 "sql_pairs": sql_pairs,
-                "id": id or "",
+                "project_id": project_id,
                 **self._components,
             },
         )
diff --git a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
index 39a9f63cd..f7f249c8a 100644
--- a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
+++ b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
@@ -97,7 +97,7 @@ async def prepare_sql_pairs(
         try:
             await self._pipelines["sql_pairs_preparation"].run(
                 sql_pairs=prepare_sql_pairs_request.sql_pairs,
-                id=prepare_sql_pairs_request.project_id,
+                project_id=prepare_sql_pairs_request.project_id,
             )
 
             self._prepare_sql_pairs_statuses[

From f40e0bcfddae559a82cc0a0e0ee25490d0df6f6e Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 2 Jan 2025 15:49:40 +0800
Subject: [PATCH 02/19] feat: pushdown the sql pair class to the core

---
 .../src/pipelines/indexing/sql_pairs_preparation.py         | 6 +++++-
 .../src/web/v1/services/sql_pairs_preparation.py            | 6 +-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
index 839bb2cb6..971089ae9 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
@@ -16,7 +16,6 @@
 from src.core.pipeline import BasicPipeline
 from src.core.provider import DocumentStoreProvider, EmbedderProvider, LLMProvider
 from src.pipelines.indexing import AsyncDocumentWriter, SqlPairsCleaner
-from src.web.v1.services.sql_pairs_preparation import SqlPair
 
 logger = logging.getLogger("wren-ai-service")
 
@@ -48,6 +47,11 @@
 """
 
 
+class SqlPair(BaseModel):
+    sql: str
+    id: str
+
+
 @component
 class SqlPairsConverter:
     @component.output_types(documents=List[Document])
diff --git a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
index f7f249c8a..ca7d4ae11 100644
--- a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
+++ b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
@@ -6,17 +6,13 @@
 from pydantic import BaseModel
 
 from src.core.pipeline import BasicPipeline
+from src.pipelines.indexing.sql_pairs_preparation import SqlPair
 from src.utils import trace_metadata
 
 logger = logging.getLogger("wren-ai-service")
 
 
 # POST /v1/sql-pairs
-class SqlPair(BaseModel):
-    sql: str
-    id: str
-
-
 class SqlPairsPreparationRequest(BaseModel):
     _query_id: str | None = None
     sql_pairs: List[SqlPair]

From 5340ee727ac23baff8969a812856ae488a001319 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 2 Jan 2025 16:10:12 +0800
Subject: [PATCH 03/19] feat: tool to convert mdl to str for http request
 testing

---
 wren-ai-service/Justfile            |  3 ++
 wren-ai-service/tools/mdl_to_str.py | 45 +++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 wren-ai-service/tools/mdl_to_str.py

diff --git a/wren-ai-service/Justfile b/wren-ai-service/Justfile
index b52f7ed4a..89d8963bd 100644
--- a/wren-ai-service/Justfile
+++ b/wren-ai-service/Justfile
@@ -69,3 +69,6 @@ run-sql mdl_path="" data_source="" sample_dataset="":
 
 report-gen:
 	poetry run python eval/llm_trace_analysis/report_gen.py
+
+mdl-to-str mdl_path="":
+	poetry run python tools/mdl_to_str.py -p {{mdl_path}}
diff --git a/wren-ai-service/tools/mdl_to_str.py b/wren-ai-service/tools/mdl_to_str.py
new file mode 100644
index 000000000..cfab8f8bf
--- /dev/null
+++ b/wren-ai-service/tools/mdl_to_str.py
@@ -0,0 +1,45 @@
+import orjson
+
+
+def to_str(mdl: dict) -> str:
+    """Convert MDL dictionary to string format with proper escaping.
+
+    Args:
+        mdl (dict): The MDL dictionary containing schema information
+
+    Returns:
+        str: Properly escaped string representation of the MDL
+
+    Example:
+        mdl = {
+            "schema": "public",
+            "models": [
+                {"name": "table1"}
+            ]
+        }
+        result = to_str(mdl)
+        # Returns escaped string representation
+    """
+
+    mdl_str = orjson.dumps(mdl).decode("utf-8")
+
+    mdl_str = mdl_str.replace("\\", "\\\\")  # Escape backslashes
+    mdl_str = mdl_str.replace('"', '\\"')  # Escape double quotes
+
+    return mdl_str
+
+
+def _args():
+    parser = argparse.ArgumentParser(
+        description="Convert MDL JSON file to escaped string format"
+    )
+    parser.add_argument("-p", "--path", help="Path to input MDL JSON file")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    import argparse
+
+    args = _args()
+    mdl = orjson.loads(open(args.path).read())
+    print(to_str(mdl))

From ea9926249996e53e9ea941311458f0bb588983a6 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 2 Jan 2025 17:53:33 +0800
Subject: [PATCH 04/19] feat: index sql pairs if MDL includes the key

---
 wren-ai-service/src/globals.py                 |  3 +++
 .../web/v1/services/semantics_preparation.py   | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/wren-ai-service/src/globals.py b/wren-ai-service/src/globals.py
index 737bfda1c..a17a07b77 100644
--- a/wren-ai-service/src/globals.py
+++ b/wren-ai-service/src/globals.py
@@ -76,6 +76,9 @@ def create_service_container(
                 "table_description": indexing.TableDescription(
                     **pipe_components["table_description_indexing"],
                 ),
+                "sql_pairs_preparation": indexing.SqlPairsPreparation(
+                    **pipe_components["sql_pairs_preparation"],
+                ),
             },
             **query_cache,
         ),
diff --git a/wren-ai-service/src/web/v1/services/semantics_preparation.py b/wren-ai-service/src/web/v1/services/semantics_preparation.py
index c4e23a286..f9540b301 100644
--- a/wren-ai-service/src/web/v1/services/semantics_preparation.py
+++ b/wren-ai-service/src/web/v1/services/semantics_preparation.py
@@ -2,11 +2,13 @@
 import logging
 from typing import Dict, Literal, Optional
 
+import orjson
 from cachetools import TTLCache
 from langfuse.decorators import observe
 from pydantic import AliasChoices, BaseModel, Field
 
 from src.core.pipeline import BasicPipeline
+from src.pipelines.indexing.sql_pairs_preparation import SqlPair
 from src.utils import trace_metadata
 
 logger = logging.getLogger("wren-ai-service")
@@ -55,6 +57,17 @@ def __init__(
             str, SemanticsPreparationStatusResponse
         ] = TTLCache(maxsize=maxsize, ttl=ttl)
 
+    def _sql_pairs(self, input: dict) -> asyncio.Task:
+        sql_pairs = [
+            SqlPair(**pair)
+            for pair in orjson.loads(input["mdl_str"]).get("sqlPairs", [])
+        ]
+
+        return self._pipelines["sql_pairs_preparation"].run(
+            sql_pairs=sql_pairs,
+            project_id=input["project_id"],
+        )
+
     @observe(name="Prepare Semantics")
     @trace_metadata
     async def prepare_semantics(
@@ -82,6 +95,11 @@ async def prepare_semantics(
                 for name in ["db_schema", "historical_question", "table_description"]
             ]
 
+            if "sqlPairs" in input["mdl_str"]:
+                # this is a temporary usage for embedding some sql pairs at MDL level
+                # will expect to remove or refactor this in the future
+                tasks.append(self._sql_pairs(input))
+
             await asyncio.gather(*tasks)
 
             self._prepare_semantics_statuses[

From e4df29b0361e1034ed544406281f44f256fc05da Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 2 Jan 2025 18:25:46 +0800
Subject: [PATCH 05/19] fix: failed test cases

---
 .../pytest/pipelines/indexing/test_sql_pairs_deletion.py    | 2 +-
 .../pytest/pipelines/indexing/test_sql_pairs_preparation.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
index 096f97c96..325beea3e 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
@@ -27,7 +27,7 @@ async def test_sql_pairs_deletion():
     )
     await sql_pairs_preparation.run(
         sql_pairs=sql_pairs,
-        id="fake-id",
+        project_id="fake-id",
     )
 
     sql_pairs_deletion = SqlPairsDeletion(**pipe_components["sql_pairs_deletion"])
diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py
index f2eeefbee..8ef66ea66 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py
@@ -25,7 +25,7 @@ async def test_sql_pairs_preparation_saving_to_document_store():
             SqlPair(sql="SELECT * FROM book", id="1"),
             SqlPair(sql="SELECT * FROM author", id="2"),
         ],
-        id="fake-id",
+        project_id="fake-id",
     )
 
     assert await store.count_documents() == 2
@@ -56,7 +56,7 @@ async def test_sql_pairs_preparation_saving_to_document_store_with_multiple_proj
             SqlPair(sql="SELECT * FROM book", id="1"),
             SqlPair(sql="SELECT * FROM author", id="2"),
         ],
-        id="fake-id",
+        project_id="fake-id",
     )
 
     await sql_pairs_preparation.run(
@@ -64,7 +64,7 @@ async def test_sql_pairs_preparation_saving_to_document_store_with_multiple_proj
             SqlPair(sql="SELECT * FROM book", id="1"),
             SqlPair(sql="SELECT * FROM author", id="2"),
         ],
-        id="fake-id-2",
+        project_id="fake-id-2",
     )
 
     assert await store.count_documents() == 4

From b7c759c1274870397a05040473002a15ca507235 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 2 Jan 2025 18:42:35 +0800
Subject: [PATCH 06/19] fix: correct the behavior for document id

---
 wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
index 971089ae9..8d0f219de 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
@@ -63,7 +63,7 @@ def run(self, sql_pairs: List[Dict[str, Any]], project_id: Optional[str] = ""):
         return {
             "documents": [
                 Document(
-                    id=sql_pair.get("id", str(uuid.uuid4())),
+                    id=str(uuid.uuid4()),
                     meta={
                         "sql_pair_id": sql_pair.get("id"),
                         "sql": sql_pair.get("sql"),

From 2819a4f096686694d44b617e2ff74cd7de22bc50 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Mon, 6 Jan 2025 17:30:24 +0800
Subject: [PATCH 07/19] feat: prompt enhancement for SQL pairs

---
 .../pipelines/generation/sql_generation.py    | 20 +++++++++----------
 .../src/pipelines/generation/utils/sql.py     | 20 +++++++++++++++++++
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/wren-ai-service/src/pipelines/generation/sql_generation.py b/wren-ai-service/src/pipelines/generation/sql_generation.py
index 3509d20ae..b8cffbac4 100644
--- a/wren-ai-service/src/pipelines/generation/sql_generation.py
+++ b/wren-ai-service/src/pipelines/generation/sql_generation.py
@@ -32,6 +32,16 @@
     {{ document }}
 {% endfor %}
 
+{% if sql_samples %}
+### SAMPLES ###
+{% for sample in sql_samples %}
+Summary:
+{{sample.summary}}
+SQL:
+{{sample.sql}}
+{% endfor %}
+{% endif %}
+
 {% if exclude %}
 ### EXCLUDED STATEMETS ###
 Ensure that the following excluded statements are not used in the generated queries to maintain variety and avoid repetition.
@@ -54,16 +64,6 @@
     ]
 }
 
-{% if sql_samples %}
-### SAMPLES ###
-{% for sample in sql_samples %}
-Summary:
-{{sample.summary}}
-SQL:
-{{sample.sql}}
-{% endfor %}
-{% endif %}
-
 ### QUESTION ###
 User's Question: {{ query }}
 Current Time: {{ current_time }}
diff --git a/wren-ai-service/src/pipelines/generation/utils/sql.py b/wren-ai-service/src/pipelines/generation/utils/sql.py
index b2b298876..d1750601e 100644
--- a/wren-ai-service/src/pipelines/generation/utils/sql.py
+++ b/wren-ai-service/src/pipelines/generation/utils/sql.py
@@ -471,6 +471,26 @@ async def _task(result: Dict[str, str]):
   PurchaseTimestamp >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month') AND
   PurchaseTimestamp < DATE_TRUNC('month', CURRENT_DATE)
 
+## LESSON 3 ##
+Finally, you will learn from the sample SQL queries provided in the input. These samples demonstrate best practices and common patterns for querying this specific database.
+
+For each sample, you should:
+1. Study the summary that explains what the query aims to accomplish
+2. Analyze the SQL implementation to understand:
+   - Table structures and relationships used
+   - Specific functions and operators employed
+   - Query patterns and techniques demonstrated
+3. Use these samples as reference patterns when generating similar queries
+4. Adapt the techniques shown in the samples to match new query requirements while maintaining consistent style and approach
+
+The samples will help you understand:
+- Preferred table join patterns
+- Common aggregation methods
+- Specific function usage
+- Query structure and formatting conventions
+
+When generating new queries, try to follow similar patterns when applicable, while adapting them to the specific requirements of each new query.
+
 Learn about the usage of the schema structures and generate SQL based on them.
 
 """

From 7d2b08883195d4c8a494da47afc9cb7ff04ca590 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Mon, 6 Jan 2025 18:31:28 +0800
Subject: [PATCH 08/19] feat: use question instead of sql summary

---
 .../src/pipelines/generation/sql_generation.py      |  4 ++--
 .../src/pipelines/generation/utils/sql.py           |  2 +-
 .../src/pipelines/indexing/sql_pairs_preparation.py | 13 ++++++++++---
 .../src/pipelines/retrieval/sql_pairs_retrieval.py  |  3 ++-
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/wren-ai-service/src/pipelines/generation/sql_generation.py b/wren-ai-service/src/pipelines/generation/sql_generation.py
index b8cffbac4..3417f308a 100644
--- a/wren-ai-service/src/pipelines/generation/sql_generation.py
+++ b/wren-ai-service/src/pipelines/generation/sql_generation.py
@@ -35,8 +35,8 @@
 {% if sql_samples %}
 ### SAMPLES ###
 {% for sample in sql_samples %}
-Summary:
-{{sample.summary}}
+Question:
+{{sample.question}}
 SQL:
 {{sample.sql}}
 {% endfor %}
diff --git a/wren-ai-service/src/pipelines/generation/utils/sql.py b/wren-ai-service/src/pipelines/generation/utils/sql.py
index d1750601e..08f15c85e 100644
--- a/wren-ai-service/src/pipelines/generation/utils/sql.py
+++ b/wren-ai-service/src/pipelines/generation/utils/sql.py
@@ -475,7 +475,7 @@ async def _task(result: Dict[str, str]):
 Finally, you will learn from the sample SQL queries provided in the input. These samples demonstrate best practices and common patterns for querying this specific database.
 
 For each sample, you should:
-1. Study the summary that explains what the query aims to accomplish
+1. Study the question that explains what the query aims to accomplish
 2. Analyze the SQL implementation to understand:
    - Table structures and relationships used
    - Specific functions and operators employed
diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
index 8d0f219de..c6a74b1dd 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
@@ -48,8 +48,9 @@
 
 
 class SqlPair(BaseModel):
-    sql: str
     id: str
+    sql: str
+    question: str
 
 
 @component
@@ -67,9 +68,10 @@ def run(self, sql_pairs: List[Dict[str, Any]], project_id: Optional[str] = ""):
                     meta={
                         "sql_pair_id": sql_pair.get("id"),
                         "sql": sql_pair.get("sql"),
+                        "intention": sql_pair.get("intention"),
                         **addition,
                     },
-                    content=sql_pair.get("intention"),
+                    content=sql_pair.get("question"),
                 )
                 for sql_pair in sql_pairs
             ]
@@ -108,7 +110,12 @@ def post_process(
     ]
 
     return [
-        {"id": sql_pair.id, "sql": sql_pair.sql, "intention": intention}
+        {
+            "id": sql_pair.id,
+            "sql": sql_pair.sql,
+            "question": sql_pair.question,
+            "intention": intention,
+        }
         for sql_pair, intention in zip(sql_pairs, intentions)
     ]
 
diff --git a/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py b/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py
index abba3b016..4d86846ee 100644
--- a/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py
+++ b/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py
@@ -25,7 +25,8 @@ def run(self, documents: List[Document]):
 
         for doc in documents:
             formatted = {
-                "summary": doc.content,
+                "question": doc.content,
+                "intention": doc.meta.get("intention"),
                 "sql": doc.meta.get("sql"),
             }
             list.append(formatted)

From 6d5639302b63db611c93a85f5b6f838f9f15a629 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Wed, 8 Jan 2025 18:58:35 +0800
Subject: [PATCH 09/19] feat: sql pairs from external file

---
 deployment/kustomizations/base/cm.yaml        |   3 +-
 docker/config.example.yaml                    |   3 +-
 wren-ai-service/src/config.py                 |   2 +
 wren-ai-service/src/globals.py                |   9 +-
 .../src/pipelines/indexing/__init__.py        |   4 +-
 ...{sql_pairs_preparation.py => sql_pairs.py} | 147 +++++++-----------
 .../web/v1/services/semantics_preparation.py  |  25 +--
 .../web/v1/services/sql_pairs_preparation.py  |   2 +-
 .../tools/config/config.example.yaml          |   3 +-
 wren-ai-service/tools/config/config.full.yaml |   3 +-
 10 files changed, 72 insertions(+), 129 deletions(-)
 rename wren-ai-service/src/pipelines/indexing/{sql_pairs_preparation.py => sql_pairs.py} (53%)

diff --git a/deployment/kustomizations/base/cm.yaml b/deployment/kustomizations/base/cm.yaml
index 02c6ae447..654d43c96 100644
--- a/deployment/kustomizations/base/cm.yaml
+++ b/deployment/kustomizations/base/cm.yaml
@@ -152,10 +152,9 @@ data:
         document_store: qdrant
       - name: data_assistance
         llm: litellm_llm.gpt-4o-mini-2024-07-18
-      - name: sql_pairs_preparation
+      - name: sql_pairs_indexing
         document_store: qdrant
         embedder: openai_embedder.text-embedding-3-large
-        llm: litellm_llm.gpt-4o-mini-2024-07-18
       - name: sql_pairs_deletion
         document_store: qdrant
         embedder: openai_embedder.text-embedding-3-large 
diff --git a/docker/config.example.yaml b/docker/config.example.yaml
index 25cadacf1..2265b4d28 100644
--- a/docker/config.example.yaml
+++ b/docker/config.example.yaml
@@ -104,10 +104,9 @@ pipes:
     document_store: qdrant
   - name: data_assistance
     llm: litellm_llm.gpt-4o-mini-2024-07-18
-  - name: sql_pairs_preparation
+  - name: sql_pairs_indexing
     document_store: qdrant
     embedder: openai_embedder.text-embedding-3-large
-    llm: litellm_llm.gpt-4o-mini-2024-07-18
   - name: sql_pairs_deletion
     document_store: qdrant
     embedder: openai_embedder.text-embedding-3-large 
diff --git a/wren-ai-service/src/config.py b/wren-ai-service/src/config.py
index 4ed5ca697..a41dcfc7e 100644
--- a/wren-ai-service/src/config.py
+++ b/wren-ai-service/src/config.py
@@ -54,6 +54,8 @@ class Settings(BaseSettings):
     config_path: str = Field(default="config.yaml")
     _components: list[dict]
 
+    sql_pairs_path: str = Field(default="sql_pairs.json")
+
     def __init__(self):
         load_dotenv(".env.dev", override=True)
         super().__init__()
diff --git a/wren-ai-service/src/globals.py b/wren-ai-service/src/globals.py
index a17a07b77..3622c7cc5 100644
--- a/wren-ai-service/src/globals.py
+++ b/wren-ai-service/src/globals.py
@@ -76,8 +76,9 @@ def create_service_container(
                 "table_description": indexing.TableDescription(
                     **pipe_components["table_description_indexing"],
                 ),
-                "sql_pairs_preparation": indexing.SqlPairsPreparation(
-                    **pipe_components["sql_pairs_preparation"],
+                "sql_pairs": indexing.SqlPairs(
+                    **pipe_components["sql_pairs_indexing"],
+                    sql_pairs_path=settings.sql_pairs_path,
                 ),
             },
             **query_cache,
@@ -223,8 +224,8 @@ def create_service_container(
         ),
         sql_pairs_preparation_service=SqlPairsPreparationService(
             pipelines={
-                "sql_pairs_preparation": indexing.SqlPairsPreparation(
-                    **pipe_components["sql_pairs_preparation"],
+                "sql_pairs_preparation": indexing.SqlPairs(
+                    **pipe_components["sql_pairs_indexing"],
                 ),
                 "sql_pairs_deletion": indexing.SqlPairsDeletion(
                     **pipe_components["sql_pairs_deletion"],
diff --git a/wren-ai-service/src/pipelines/indexing/__init__.py b/wren-ai-service/src/pipelines/indexing/__init__.py
index e2c821683..bf138f49c 100644
--- a/wren-ai-service/src/pipelines/indexing/__init__.py
+++ b/wren-ai-service/src/pipelines/indexing/__init__.py
@@ -114,8 +114,8 @@ async def run(
 # Put the pipelines imports here to avoid circular imports and make them accessible directly to the rest of packages
 from .db_schema import DBSchema  # noqa: E402
 from .historical_question import HistoricalQuestion  # noqa: E402
+from .sql_pairs import SqlPairs  # noqa: E402
 from .sql_pairs_deletion import SqlPairsDeletion  # noqa: E402
-from .sql_pairs_preparation import SqlPairsPreparation  # noqa: E402
 from .table_description import TableDescription  # noqa: E402
 
 __all__ = [
@@ -123,5 +123,5 @@ async def run(
     "TableDescription",
     "HistoricalQuestion",
     "SqlPairsDeletion",
-    "SqlPairsPreparation",
+    "SqlPairs",
 ]
diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py b/wren-ai-service/src/pipelines/indexing/sql_pairs.py
similarity index 53%
rename from wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
rename to wren-ai-service/src/pipelines/indexing/sql_pairs.py
index c6a74b1dd..b8616198d 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_preparation.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs.py
@@ -1,5 +1,5 @@
-import asyncio
 import logging
+import os
 import sys
 import uuid
 from typing import Any, Dict, List, Optional
@@ -8,45 +8,17 @@
 from hamilton import base
 from hamilton.async_driver import AsyncDriver
 from haystack import Document, component
-from haystack.components.builders.prompt_builder import PromptBuilder
 from haystack.document_stores.types import DuplicatePolicy
 from langfuse.decorators import observe
 from pydantic import BaseModel
 
 from src.core.pipeline import BasicPipeline
-from src.core.provider import DocumentStoreProvider, EmbedderProvider, LLMProvider
+from src.core.provider import DocumentStoreProvider, EmbedderProvider
 from src.pipelines.indexing import AsyncDocumentWriter, SqlPairsCleaner
 
 logger = logging.getLogger("wren-ai-service")
 
 
-_system_prompt = """
-### TASK ###
-
-You are a data analyst great at generating the concise and readable summary of a SQL query.
-
-### INSTRUCTIONS ###
-
-- Summary should be concise and readable.
-- Summary should be no longer than 20 words.
-- Don't rephrase keywords in the SQL query, just use them as they are.
-
-### OUTPUT ###
-
-You need to output a JSON object as following:
-{
-    "intention": "<CONCISE_AND_READABLE_SUMMARY_STRING>"
-}
-"""
-
-_user_prompt_template = """
-### INPUT ###
-SQL: {{sql}}
-
-Please think step by step
-"""
-
-
 class SqlPair(BaseModel):
     id: str
     sql: str
@@ -56,7 +28,7 @@ class SqlPair(BaseModel):
 @component
 class SqlPairsConverter:
     @component.output_types(documents=List[Document])
-    def run(self, sql_pairs: List[Dict[str, Any]], project_id: Optional[str] = ""):
+    def run(self, sql_pairs: List[SqlPair], project_id: Optional[str] = ""):
         logger.info(f"Project ID: {project_id} Converting SQL pairs to documents...")
 
         addition = {"project_id": project_id} if project_id else {}
@@ -66,12 +38,11 @@ def run(self, sql_pairs: List[Dict[str, Any]], project_id: Optional[str] = ""):
                 Document(
                     id=str(uuid.uuid4()),
                     meta={
-                        "sql_pair_id": sql_pair.get("id"),
-                        "sql": sql_pair.get("sql"),
-                        "intention": sql_pair.get("intention"),
+                        "sql_pair_id": sql_pair.id,
+                        "sql": sql_pair.sql,
                         **addition,
                     },
-                    content=sql_pair.get("question"),
+                    content=sql_pair.question,
                 )
                 for sql_pair in sql_pairs
             ]
@@ -80,53 +51,42 @@ def run(self, sql_pairs: List[Dict[str, Any]], project_id: Optional[str] = ""):
 
 ## Start of Pipeline
 @observe(capture_input=False)
-def prompts(
-    sql_pairs: List[SqlPair],
-    prompt_builder: PromptBuilder,
-) -> List[dict]:
-    return [prompt_builder.run(sql=sql_pair.sql) for sql_pair in sql_pairs]
-
-
-@observe(as_type="generation", capture_input=False)
-async def generate_sql_intention(
-    prompts: List[dict],
-    generator: Any,
-) -> List[dict]:
-    async def _task(prompt: str, generator: Any):
-        return await generator(prompt=prompt.get("prompt"))
-
-    tasks = [_task(prompt, generator) for prompt in prompts]
-    return await asyncio.gather(*tasks)
+def boilerplates(
+    mdl_str: str,
+) -> List[str]:
+    mdl = orjson.loads(mdl_str)
+
+    return {
+        boilerplate.lower()
+        for model in mdl.get("models", [])
+        if (boilerplate := model.get("properties", {}).get("boilerplate"))
+    }
 
 
 @observe(capture_input=False)
-def post_process(
-    generate_sql_intention: List[dict],
-    sql_pairs: List[SqlPair],
-) -> List[Dict[str, Any]]:
-    intentions = [
-        orjson.loads(result["replies"][0])["intention"]
-        for result in generate_sql_intention
-    ]
-
+def sql_pairs(
+    boilerplates: List[str],
+    external_pairs: Dict[str, Any],
+) -> List[SqlPair]:
     return [
-        {
-            "id": sql_pair.id,
-            "sql": sql_pair.sql,
-            "question": sql_pair.question,
-            "intention": intention,
-        }
-        for sql_pair, intention in zip(sql_pairs, intentions)
+        SqlPair(
+            id=pair.get("id"),
+            question=pair.get("question"),
+            sql=pair.get("sql"),
+        )
+        for boilerplate in boilerplates
+        if boilerplate in external_pairs
+        for pair in external_pairs[boilerplate]
     ]
 
 
 @observe(capture_input=False)
 def to_documents(
-    post_process: List[Dict[str, Any]],
+    sql_pairs: List[SqlPair],
     document_converter: SqlPairsConverter,
     project_id: Optional[str] = "",
 ) -> Dict[str, Any]:
-    return document_converter.run(sql_pairs=post_process, project_id=project_id)
+    return document_converter.run(sql_pairs=sql_pairs, project_id=project_id)
 
 
 @observe(capture_input=False, capture_output=False)
@@ -159,62 +119,59 @@ async def write(
 
 
 ## End of Pipeline
-class SqlIntentionResult(BaseModel):
-    intention: str
 
 
-_GENERATION_MODEL_KWARGS = {
-    "response_format": {
-        "type": "json_schema",
-        "json_schema": {
-            "name": "sql_intention_results",
-            "schema": SqlIntentionResult.model_json_schema(),
-        },
-    }
-}
+def _load_sql_pairs(sql_pairs_path: str) -> Dict[str, Any]:
+    if not sql_pairs_path:
+        return {}
+
+    if not os.path.exists(sql_pairs_path):
+        logger.warning(f"SQL pairs file not found: {sql_pairs_path}")
+        return {}
 
+    with open(sql_pairs_path, "r") as file:
+        return orjson.loads(file.read())
 
-class SqlPairsPreparation(BasicPipeline):
+
+class SqlPairs(BasicPipeline):
     def __init__(
         self,
         embedder_provider: EmbedderProvider,
-        llm_provider: LLMProvider,
         document_store_provider: DocumentStoreProvider,
+        sql_pairs_path: Optional[str] = "sql_pairs.json",
         **kwargs,
     ) -> None:
         store = document_store_provider.get_store(dataset_name="sql_pairs")
 
         self._components = {
             "cleaner": SqlPairsCleaner(store),
-            "prompt_builder": PromptBuilder(template=_user_prompt_template),
-            "generator": llm_provider.get_generator(
-                system_prompt=_system_prompt,
-                generation_kwargs=_GENERATION_MODEL_KWARGS,
-            ),
             "embedder": embedder_provider.get_document_embedder(),
             "document_converter": SqlPairsConverter(),
             "writer": AsyncDocumentWriter(
                 document_store=store,
                 policy=DuplicatePolicy.OVERWRITE,
             ),
+            "external_pairs": _load_sql_pairs(sql_pairs_path),
         }
 
         super().__init__(
             AsyncDriver({}, sys.modules[__name__], result_builder=base.DictResult())
         )
 
-    @observe(name="SQL Pairs Preparation")
+    @observe(name="SQL Pairs Indexing")
     async def run(
-        self, sql_pairs: List[SqlPair], project_id: Optional[str] = ""
+        self,
+        mdl_str: str,
+        project_id: Optional[str] = "",
     ) -> Dict[str, Any]:
         logger.info(
-            f"Project ID: {project_id} SQL Pairs Preparation pipeline is running..."
+            f"Project ID: {project_id} SQL Pairs Indexing pipeline is running..."
         )
 
         return await self._pipe.execute(
             ["write"],
             inputs={
-                "sql_pairs": sql_pairs,
+                "mdl_str": mdl_str,
                 "project_id": project_id,
                 **self._components,
             },
@@ -225,7 +182,7 @@ async def run(
     from src.pipelines.common import dry_run_pipeline
 
     dry_run_pipeline(
-        SqlPairsPreparation,
-        "sql_pairs_preparation",
-        sql_pairs=[],
+        SqlPairs,
+        "sql_pairs_indexing",
+        mdl_str='{"models": [{"properties": {"boilerplate": "hubspot"}}]}',
     )
diff --git a/wren-ai-service/src/web/v1/services/semantics_preparation.py b/wren-ai-service/src/web/v1/services/semantics_preparation.py
index f9540b301..0be60a599 100644
--- a/wren-ai-service/src/web/v1/services/semantics_preparation.py
+++ b/wren-ai-service/src/web/v1/services/semantics_preparation.py
@@ -2,13 +2,11 @@
 import logging
 from typing import Dict, Literal, Optional
 
-import orjson
 from cachetools import TTLCache
 from langfuse.decorators import observe
 from pydantic import AliasChoices, BaseModel, Field
 
 from src.core.pipeline import BasicPipeline
-from src.pipelines.indexing.sql_pairs_preparation import SqlPair
 from src.utils import trace_metadata
 
 logger = logging.getLogger("wren-ai-service")
@@ -57,17 +55,6 @@ def __init__(
             str, SemanticsPreparationStatusResponse
         ] = TTLCache(maxsize=maxsize, ttl=ttl)
 
-    def _sql_pairs(self, input: dict) -> asyncio.Task:
-        sql_pairs = [
-            SqlPair(**pair)
-            for pair in orjson.loads(input["mdl_str"]).get("sqlPairs", [])
-        ]
-
-        return self._pipelines["sql_pairs_preparation"].run(
-            sql_pairs=sql_pairs,
-            project_id=input["project_id"],
-        )
-
     @observe(name="Prepare Semantics")
     @trace_metadata
     async def prepare_semantics(
@@ -92,14 +79,14 @@ async def prepare_semantics(
 
             tasks = [
                 self._pipelines[name].run(**input)
-                for name in ["db_schema", "historical_question", "table_description"]
+                for name in [
+                    "db_schema",
+                    "historical_question",
+                    "table_description",
+                    "sql_pairs",
+                ]
             ]
 
-            if "sqlPairs" in input["mdl_str"]:
-                # this is a temporary usage for embedding some sql pairs at MDL level
-                # will expect to remove or refactor this in the future
-                tasks.append(self._sql_pairs(input))
-
             await asyncio.gather(*tasks)
 
             self._prepare_semantics_statuses[
diff --git a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
index ca7d4ae11..e90598943 100644
--- a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
+++ b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
@@ -6,7 +6,7 @@
 from pydantic import BaseModel
 
 from src.core.pipeline import BasicPipeline
-from src.pipelines.indexing.sql_pairs_preparation import SqlPair
+from src.pipelines.indexing.sql_pairs import SqlPair
 from src.utils import trace_metadata
 
 logger = logging.getLogger("wren-ai-service")
diff --git a/wren-ai-service/tools/config/config.example.yaml b/wren-ai-service/tools/config/config.example.yaml
index 0d1d8344e..3d3551517 100644
--- a/wren-ai-service/tools/config/config.example.yaml
+++ b/wren-ai-service/tools/config/config.example.yaml
@@ -122,10 +122,9 @@ pipes:
     document_store: qdrant
   - name: data_assistance
     llm: litellm_llm.gpt-4o-mini-2024-07-18
-  - name: sql_pairs_preparation
+  - name: sql_pairs_indexing
     document_store: qdrant
     embedder: openai_embedder.text-embedding-3-large
-    llm: litellm_llm.gpt-4o-mini-2024-07-18
   - name: sql_pairs_deletion
     document_store: qdrant
     embedder: openai_embedder.text-embedding-3-large 
diff --git a/wren-ai-service/tools/config/config.full.yaml b/wren-ai-service/tools/config/config.full.yaml
index 644bf8dc3..738007312 100644
--- a/wren-ai-service/tools/config/config.full.yaml
+++ b/wren-ai-service/tools/config/config.full.yaml
@@ -141,10 +141,9 @@ pipes:
     document_store: qdrant
   - name: data_assistance
     llm: litellm_llm.gpt-4o-mini-2024-07-18
-  - name: sql_pairs_preparation
+  - name: sql_pairs_indexing
     document_store: qdrant
     embedder: openai_embedder.text-embedding-3-large
-    llm: litellm_llm.gpt-4o-mini-2024-07-18
   - name: sql_pairs_deletion
     document_store: qdrant
     embedder: openai_embedder.text-embedding-3-large 

From 73996560700046e9e2409a8c35fd004ee3b50d1a Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Wed, 8 Jan 2025 19:07:10 +0800
Subject: [PATCH 10/19] add the todo for sql pair preparation endpoint

---
 wren-ai-service/src/web/v1/services/sql_pairs_preparation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
index e90598943..49a881052 100644
--- a/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
+++ b/wren-ai-service/src/web/v1/services/sql_pairs_preparation.py
@@ -91,6 +91,7 @@ async def prepare_sql_pairs(
         }
 
         try:
+            # TODO: Implement proper SQL pairs preparation functionality. Current implementation needs to be updated.
             await self._pipelines["sql_pairs_preparation"].run(
                 sql_pairs=prepare_sql_pairs_request.sql_pairs,
                 project_id=prepare_sql_pairs_request.project_id,

From 8cbb36965018b943ef21ea66d43e5115ca100b20 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 11:04:44 +0800
Subject: [PATCH 11/19] chore: comment for deletion pipe

---
 wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py b/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py
index 42683cede..f49a79926 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs_deletion.py
@@ -26,6 +26,7 @@ async def delete_sql_pairs(
 ## End of Pipeline
 
 
+# TODO: consider removing this pipeline and using the function in the sql_pairs_indexing pipeline instead like other indexing pipelines
 class SqlPairsDeletion(BasicPipeline):
     def __init__(
         self,

From 05f1e8f63ecd39360d2e7f9b8ab6b67a78ed7b52 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 11:08:30 +0800
Subject: [PATCH 12/19] fix: test cases

---
 ...pairs_preparation.py => test_sql_pairs.py} | 34 ++++++++-----------
 .../indexing/test_sql_pairs_deletion.py       | 14 ++++----
 2 files changed, 21 insertions(+), 27 deletions(-)
 rename wren-ai-service/tests/pytest/pipelines/indexing/{test_sql_pairs_preparation.py => test_sql_pairs.py} (65%)

diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
similarity index 65%
rename from wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py
rename to wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
index 8ef66ea66..23f819a73 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_preparation.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
@@ -2,28 +2,26 @@
 
 from src.config import settings
 from src.core.provider import DocumentStoreProvider
-from src.pipelines.indexing.sql_pairs_preparation import SqlPair, SqlPairsPreparation
+from src.pipelines.indexing.sql_pairs import SqlPair, SqlPairs
 from src.providers import generate_components
 
 
 @pytest.mark.asyncio
-async def test_sql_pairs_preparation_saving_to_document_store():
+async def test_sql_pairs_indexing_saving_to_document_store():
     pipe_components = generate_components(settings.components)
     document_store_provider: DocumentStoreProvider = pipe_components[
-        "sql_pairs_preparation"
+        "sql_pairs_indexing"
     ]["document_store_provider"]
     store = document_store_provider.get_store(
         dataset_name="sql_pairs",
         recreate_index=True,
     )
 
-    sql_pairs_preparation = SqlPairsPreparation(
-        **pipe_components["sql_pairs_preparation"]
-    )
-    await sql_pairs_preparation.run(
+    sql_pairs = SqlPairs(**pipe_components["sql_pairs_indexing"])
+    await sql_pairs.run(
         sql_pairs=[
-            SqlPair(sql="SELECT * FROM book", id="1"),
-            SqlPair(sql="SELECT * FROM author", id="2"),
+            SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
+            SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
         ],
         project_id="fake-id",
     )
@@ -41,28 +39,26 @@ async def test_sql_pairs_preparation_saving_to_document_store():
 async def test_sql_pairs_preparation_saving_to_document_store_with_multiple_project_ids():
     pipe_components = generate_components(settings.components)
     document_store_provider: DocumentStoreProvider = pipe_components[
-        "sql_pairs_preparation"
+        "sql_pairs_indexing"
     ]["document_store_provider"]
     store = document_store_provider.get_store(
         dataset_name="sql_pairs",
         recreate_index=True,
     )
 
-    sql_pairs_preparation = SqlPairsPreparation(
-        **pipe_components["sql_pairs_preparation"]
-    )
-    await sql_pairs_preparation.run(
+    sql_pairs = SqlPairs(**pipe_components["sql_pairs_indexing"])
+    await sql_pairs.run(
         sql_pairs=[
-            SqlPair(sql="SELECT * FROM book", id="1"),
-            SqlPair(sql="SELECT * FROM author", id="2"),
+            SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
+            SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
         ],
         project_id="fake-id",
     )
 
-    await sql_pairs_preparation.run(
+    await sql_pairs.run(
         sql_pairs=[
-            SqlPair(sql="SELECT * FROM book", id="1"),
-            SqlPair(sql="SELECT * FROM author", id="2"),
+            SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
+            SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
         ],
         project_id="fake-id-2",
     )
diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
index 325beea3e..b654002aa 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
@@ -2,8 +2,8 @@
 
 from src.config import settings
 from src.core.provider import DocumentStoreProvider
+from src.pipelines.indexing.sql_pairs import SqlPair, SqlPairs
 from src.pipelines.indexing.sql_pairs_deletion import SqlPairsDeletion
-from src.pipelines.indexing.sql_pairs_preparation import SqlPair, SqlPairsPreparation
 from src.providers import generate_components
 
 
@@ -11,7 +11,7 @@
 async def test_sql_pairs_deletion():
     pipe_components = generate_components(settings.components)
     document_store_provider: DocumentStoreProvider = pipe_components[
-        "sql_pairs_preparation"
+        "sql_pairs_indexing"
     ]["document_store_provider"]
     store = document_store_provider.get_store(
         dataset_name="sql_pairs",
@@ -19,13 +19,11 @@ async def test_sql_pairs_deletion():
     )
 
     sql_pairs = [
-        SqlPair(sql="SELECT * FROM book", id="1"),
-        SqlPair(sql="SELECT * FROM author", id="2"),
+        SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
+        SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
     ]
-    sql_pairs_preparation = SqlPairsPreparation(
-        **pipe_components["sql_pairs_preparation"]
-    )
-    await sql_pairs_preparation.run(
+    sql_pairs = SqlPairs(**pipe_components["sql_pairs_indexing"])
+    await sql_pairs.run(
         sql_pairs=sql_pairs,
         project_id="fake-id",
     )

From 5d87164e07b5581e81393a1a96e3baa41841d0a4 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 11:12:10 +0800
Subject: [PATCH 13/19] chore: remove unused attribute in doc

---
 wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py b/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py
index 4d86846ee..50a49b30e 100644
--- a/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py
+++ b/wren-ai-service/src/pipelines/retrieval/sql_pairs_retrieval.py
@@ -26,7 +26,6 @@ def run(self, documents: List[Document]):
         for doc in documents:
             formatted = {
                 "question": doc.content,
-                "intention": doc.meta.get("intention"),
                 "sql": doc.meta.get("sql"),
             }
             list.append(formatted)

From eb4619c01351697cc78ec71853d744c04e8cdf17 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 11:26:32 +0800
Subject: [PATCH 14/19] chore: skip some test cases for sql pair service

---
 .../tests/pytest/services/test_sql_pairs_preparation.py         | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/wren-ai-service/tests/pytest/services/test_sql_pairs_preparation.py b/wren-ai-service/tests/pytest/services/test_sql_pairs_preparation.py
index 40ef4387c..d41fccadd 100644
--- a/wren-ai-service/tests/pytest/services/test_sql_pairs_preparation.py
+++ b/wren-ai-service/tests/pytest/services/test_sql_pairs_preparation.py
@@ -47,6 +47,7 @@ def service_metadata():
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip(reason="due to pipeline change, this test is not applicable anymore")
 async def test_sql_pairs_preparation(
     sql_pairs_preparation_service: SqlPairsPreparationService,
     service_metadata: dict,
@@ -93,6 +94,7 @@ async def test_sql_pairs_preparation(
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip(reason="due to pipeline change, this test is not applicable anymore")
 async def test_sql_pairs_deletion(
     sql_pairs_preparation_service: SqlPairsPreparationService,
     service_metadata: dict,

From 513350d9d263e8729bca355980b7a68c8af95adc Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 11:50:45 +0800
Subject: [PATCH 15/19] fix: test cases for sql pair and deletion

---
 wren-ai-service/tests/data/pairs.json         | 14 +++++++++++
 .../pipelines/indexing/test_sql_pairs.py      | 25 ++++++++-----------
 .../indexing/test_sql_pairs_deletion.py       | 21 ++++++----------
 3 files changed, 31 insertions(+), 29 deletions(-)
 create mode 100644 wren-ai-service/tests/data/pairs.json

diff --git a/wren-ai-service/tests/data/pairs.json b/wren-ai-service/tests/data/pairs.json
new file mode 100644
index 000000000..5f5ab0967
--- /dev/null
+++ b/wren-ai-service/tests/data/pairs.json
@@ -0,0 +1,14 @@
+{
+    "test": [
+        {
+            "id": "1",
+            "question": "What is the book?",
+            "sql": "SELECT * FROM book"
+        },
+        {
+            "id": "2",
+            "question": "What is the author?",
+            "sql": "SELECT * FROM author"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
index 23f819a73..bbce89413 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
@@ -2,7 +2,7 @@
 
 from src.config import settings
 from src.core.provider import DocumentStoreProvider
-from src.pipelines.indexing.sql_pairs import SqlPair, SqlPairs
+from src.pipelines.indexing import SqlPairs
 from src.providers import generate_components
 
 
@@ -17,12 +17,11 @@ async def test_sql_pairs_indexing_saving_to_document_store():
         recreate_index=True,
     )
 
-    sql_pairs = SqlPairs(**pipe_components["sql_pairs_indexing"])
+    sql_pairs = SqlPairs(
+        **pipe_components["sql_pairs_indexing"], sql_pairs_path="tests/data/pairs.json"
+    )
     await sql_pairs.run(
-        sql_pairs=[
-            SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
-            SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
-        ],
+        mdl_str='{"models": [{"properties": {"boilerplate": "test"}}]}',
         project_id="fake-id",
     )
 
@@ -46,20 +45,16 @@ async def test_sql_pairs_preparation_saving_to_document_store_with_multiple_proj
         recreate_index=True,
     )
 
-    sql_pairs = SqlPairs(**pipe_components["sql_pairs_indexing"])
+    sql_pairs = SqlPairs(
+        **pipe_components["sql_pairs_indexing"], sql_pairs_path="tests/data/pairs.json"
+    )
     await sql_pairs.run(
-        sql_pairs=[
-            SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
-            SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
-        ],
+        mdl_str='{"models": [{"properties": {"boilerplate": "test"}}]}',
         project_id="fake-id",
     )
 
     await sql_pairs.run(
-        sql_pairs=[
-            SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
-            SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
-        ],
+        mdl_str='{"models": [{"properties": {"boilerplate": "test"}}]}',
         project_id="fake-id-2",
     )
 
diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
index b654002aa..18226ab10 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs_deletion.py
@@ -2,8 +2,7 @@
 
 from src.config import settings
 from src.core.provider import DocumentStoreProvider
-from src.pipelines.indexing.sql_pairs import SqlPair, SqlPairs
-from src.pipelines.indexing.sql_pairs_deletion import SqlPairsDeletion
+from src.pipelines.indexing import SqlPairs, SqlPairsDeletion
 from src.providers import generate_components
 
 
@@ -18,23 +17,17 @@ async def test_sql_pairs_deletion():
         recreate_index=True,
     )
 
-    sql_pairs = [
-        SqlPair(sql="SELECT * FROM book", id="1", question="What is the book?"),
-        SqlPair(sql="SELECT * FROM author", id="2", question="What is the author?"),
-    ]
-    sql_pairs = SqlPairs(**pipe_components["sql_pairs_indexing"])
+    sql_pairs = SqlPairs(
+        **pipe_components["sql_pairs_indexing"], sql_pairs_path="tests/data/pairs.json"
+    )
     await sql_pairs.run(
-        sql_pairs=sql_pairs,
+        mdl_str='{"models": [{"properties": {"boilerplate": "test"}}]}',
         project_id="fake-id",
     )
 
     sql_pairs_deletion = SqlPairsDeletion(**pipe_components["sql_pairs_deletion"])
-    await sql_pairs_deletion.run(
-        id="fake-id-2", sql_pair_ids=[sql_pair.id for sql_pair in sql_pairs]
-    )
+    await sql_pairs_deletion.run(id="fake-id-2", sql_pair_ids=["1", "2"])
     assert await store.count_documents() == 2
 
-    await sql_pairs_deletion.run(
-        id="fake-id", sql_pair_ids=[sql_pair.id for sql_pair in sql_pairs]
-    )
+    await sql_pairs_deletion.run(id="fake-id", sql_pair_ids=["1", "2"])
     assert await store.count_documents() == 0

From 923f5ce19d25b8a3a25f8b70323252e15542c08e Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 12:01:02 +0800
Subject: [PATCH 16/19] chore: mount sql pair file for each deployment

---
 deployment/kustomizations/base/cm.yaml                     | 4 ++++
 deployment/kustomizations/base/deploy-wren-ai-service.yaml | 4 ++++
 docker/docker-compose-dev.yaml                             | 2 ++
 docker/docker-compose.yaml                                 | 2 ++
 wren-ai-service/src/config.py                              | 2 +-
 5 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/deployment/kustomizations/base/cm.yaml b/deployment/kustomizations/base/cm.yaml
index 654d43c96..6f38b3df1 100644
--- a/deployment/kustomizations/base/cm.yaml
+++ b/deployment/kustomizations/base/cm.yaml
@@ -182,3 +182,7 @@ data:
       langfuse_enable: true
       logging_level: DEBUG
       development: false
+  pairs.json: |
+    {
+      "sample": []
+    }
diff --git a/deployment/kustomizations/base/deploy-wren-ai-service.yaml b/deployment/kustomizations/base/deploy-wren-ai-service.yaml
index 928bac05b..6664e98ef 100644
--- a/deployment/kustomizations/base/deploy-wren-ai-service.yaml
+++ b/deployment/kustomizations/base/deploy-wren-ai-service.yaml
@@ -63,6 +63,8 @@ spec:
                   key: LANGFUSE_SECRET_KEY
             - name: CONFIG_PATH
               value: /app/data/config.yaml
+            - name: SQL_PAIRS_PATH
+              value: /app/data/pairs.json
           ports:
             - containerPort: 5555
       volumes:
@@ -72,3 +74,5 @@ spec:
             items:
               - key: config.yaml
                 path: config.yaml
+              - key: pairs.json
+                path: pairs.json
diff --git a/docker/docker-compose-dev.yaml b/docker/docker-compose-dev.yaml
index 2faaf633e..e28bd005c 100644
--- a/docker/docker-compose-dev.yaml
+++ b/docker/docker-compose-dev.yaml
@@ -45,10 +45,12 @@ services:
       # using PYTHONUNBUFFERED: 1 can fix this
       PYTHONUNBUFFERED: 1
       CONFIG_PATH: /app/data/config.yaml
+      SQL_PAIRS_PATH: /app/data/pairs.json
     env_file:
       - ${PROJECT_DIR}/.env
     volumes:
       - ${PROJECT_DIR}/config.yaml:/app/data/config.yaml
+      - ${PROJECT_DIR}/pairs.json:/app/data/pairs.json
     networks:
       - wren
     depends_on:
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index 01a354f7d..d4e9b48f8 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -56,10 +56,12 @@ services:
       # using PYTHONUNBUFFERED: 1 can fix this
       PYTHONUNBUFFERED: 1
       CONFIG_PATH: /app/data/config.yaml
+      SQL_PAIRS_PATH: /app/data/pairs.json
     env_file:
       - ${PROJECT_DIR}/.env
     volumes:
       - ${PROJECT_DIR}/config.yaml:/app/data/config.yaml
+      - ${PROJECT_DIR}/pairs.json:/app/data/pairs.json
     networks:
       - wren
     depends_on:
diff --git a/wren-ai-service/src/config.py b/wren-ai-service/src/config.py
index a41dcfc7e..991a29010 100644
--- a/wren-ai-service/src/config.py
+++ b/wren-ai-service/src/config.py
@@ -54,7 +54,7 @@ class Settings(BaseSettings):
     config_path: str = Field(default="config.yaml")
     _components: list[dict]
 
-    sql_pairs_path: str = Field(default="sql_pairs.json")
+    sql_pairs_path: str = Field(default="pairs.json")
 
     def __init__(self):
         load_dotenv(".env.dev", override=True)

From 254c180ac47d868c2049f3bb1efd054125cac051 Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 13:46:13 +0800
Subject: [PATCH 17/19] chore: deal import, type and error handling suggestion

---
 wren-ai-service/src/pipelines/indexing/sql_pairs.py | 12 ++++++++----
 wren-ai-service/tools/mdl_to_str.py                 |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs.py b/wren-ai-service/src/pipelines/indexing/sql_pairs.py
index b8616198d..c5c322d30 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs.py
@@ -2,7 +2,7 @@
 import os
 import sys
 import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Set
 
 import orjson
 from hamilton import base
@@ -53,7 +53,7 @@ def run(self, sql_pairs: List[SqlPair], project_id: Optional[str] = ""):
 @observe(capture_input=False)
 def boilerplates(
     mdl_str: str,
-) -> List[str]:
+) -> Set[str]:
     mdl = orjson.loads(mdl_str)
 
     return {
@@ -129,8 +129,12 @@ def _load_sql_pairs(sql_pairs_path: str) -> Dict[str, Any]:
         logger.warning(f"SQL pairs file not found: {sql_pairs_path}")
         return {}
 
-    with open(sql_pairs_path, "r") as file:
-        return orjson.loads(file.read())
+    try:
+        with open(sql_pairs_path, "r") as file:
+            return orjson.loads(file.read())
+    except Exception as e:
+        logger.error(f"Error loading SQL pairs file: {e}")
+        return {}
 
 
 class SqlPairs(BasicPipeline):
diff --git a/wren-ai-service/tools/mdl_to_str.py b/wren-ai-service/tools/mdl_to_str.py
index cfab8f8bf..d7561e738 100644
--- a/wren-ai-service/tools/mdl_to_str.py
+++ b/wren-ai-service/tools/mdl_to_str.py
@@ -1,3 +1,5 @@
+import argparse
+
 import orjson
 
 
@@ -38,8 +40,6 @@ def _args():
 
 
 if __name__ == "__main__":
-    import argparse
-
     args = _args()
     mdl = orjson.loads(open(args.path).read())
     print(to_str(mdl))

From c79f907f5e32f96926c1b569ddd32c55ab39529a Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Thu, 9 Jan 2025 13:59:06 +0800
Subject: [PATCH 18/19] chore: rename test case, and correct pipeline type

---
 wren-ai-service/src/pipelines/indexing/sql_pairs.py             | 2 +-
 .../tests/pytest/pipelines/indexing/test_sql_pairs.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/wren-ai-service/src/pipelines/indexing/sql_pairs.py b/wren-ai-service/src/pipelines/indexing/sql_pairs.py
index c5c322d30..d4bbf9136 100644
--- a/wren-ai-service/src/pipelines/indexing/sql_pairs.py
+++ b/wren-ai-service/src/pipelines/indexing/sql_pairs.py
@@ -65,7 +65,7 @@ def boilerplates(
 
 @observe(capture_input=False)
 def sql_pairs(
-    boilerplates: List[str],
+    boilerplates: Set[str],
     external_pairs: Dict[str, Any],
 ) -> List[SqlPair]:
     return [
diff --git a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
index bbce89413..f658abcf5 100644
--- a/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
+++ b/wren-ai-service/tests/pytest/pipelines/indexing/test_sql_pairs.py
@@ -35,7 +35,7 @@ async def test_sql_pairs_indexing_saving_to_document_store():
 
 
 @pytest.mark.asyncio
-async def test_sql_pairs_preparation_saving_to_document_store_with_multiple_project_ids():
+async def test_sql_pairs_indexing_saving_to_document_store_with_multiple_project_ids():
     pipe_components = generate_components(settings.components)
     document_store_provider: DocumentStoreProvider = pipe_components[
         "sql_pairs_indexing"

From 81f13bf53c2a516585a6e30bc12b001aaafde4ff Mon Sep 17 00:00:00 2001
From: Pao-Sheng Wang <paooap.oappao@gmail.com>
Date: Fri, 10 Jan 2025 11:40:57 +0800
Subject: [PATCH 19/19] chore: remove empty pairs file from each deployment

---
 deployment/kustomizations/base/cm.yaml                     | 6 +-----
 deployment/kustomizations/base/deploy-wren-ai-service.yaml | 4 ----
 docker/docker-compose-dev.yaml                             | 2 --
 docker/docker-compose.yaml                                 | 2 --
 4 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/deployment/kustomizations/base/cm.yaml b/deployment/kustomizations/base/cm.yaml
index 6f38b3df1..a9fc26f4a 100644
--- a/deployment/kustomizations/base/cm.yaml
+++ b/deployment/kustomizations/base/cm.yaml
@@ -181,8 +181,4 @@ data:
       langfuse_host: https://cloud.langfuse.com
       langfuse_enable: true
       logging_level: DEBUG
-      development: false
-  pairs.json: |
-    {
-      "sample": []
-    }
+      development: false
\ No newline at end of file
diff --git a/deployment/kustomizations/base/deploy-wren-ai-service.yaml b/deployment/kustomizations/base/deploy-wren-ai-service.yaml
index 6664e98ef..928bac05b 100644
--- a/deployment/kustomizations/base/deploy-wren-ai-service.yaml
+++ b/deployment/kustomizations/base/deploy-wren-ai-service.yaml
@@ -63,8 +63,6 @@ spec:
                   key: LANGFUSE_SECRET_KEY
             - name: CONFIG_PATH
               value: /app/data/config.yaml
-            - name: SQL_PAIRS_PATH
-              value: /app/data/pairs.json
           ports:
             - containerPort: 5555
       volumes:
@@ -74,5 +72,3 @@ spec:
             items:
               - key: config.yaml
                 path: config.yaml
-              - key: pairs.json
-                path: pairs.json
diff --git a/docker/docker-compose-dev.yaml b/docker/docker-compose-dev.yaml
index e28bd005c..2faaf633e 100644
--- a/docker/docker-compose-dev.yaml
+++ b/docker/docker-compose-dev.yaml
@@ -45,12 +45,10 @@ services:
       # using PYTHONUNBUFFERED: 1 can fix this
       PYTHONUNBUFFERED: 1
       CONFIG_PATH: /app/data/config.yaml
-      SQL_PAIRS_PATH: /app/data/pairs.json
     env_file:
       - ${PROJECT_DIR}/.env
     volumes:
       - ${PROJECT_DIR}/config.yaml:/app/data/config.yaml
-      - ${PROJECT_DIR}/pairs.json:/app/data/pairs.json
     networks:
       - wren
     depends_on:
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index d4e9b48f8..01a354f7d 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -56,12 +56,10 @@ services:
       # using PYTHONUNBUFFERED: 1 can fix this
       PYTHONUNBUFFERED: 1
       CONFIG_PATH: /app/data/config.yaml
-      SQL_PAIRS_PATH: /app/data/pairs.json
     env_file:
       - ${PROJECT_DIR}/.env
     volumes:
       - ${PROJECT_DIR}/config.yaml:/app/data/config.yaml
-      - ${PROJECT_DIR}/pairs.json:/app/data/pairs.json
     networks:
       - wren
     depends_on: