Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified input/rodent_dataset.xlsx
Binary file not shown.
8 changes: 3 additions & 5 deletions pipeline/bundle_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
accompanying manifest records. It can be invoked as a CLI tool or imported for
unit testing. Bundling supports three modes:

* Size-based (default): chunk the ordered list of PDFs into groups of
``bundle_size``.
* School-based: group by ``school_code`` and then chunk each group while
preserving client order.
* Board-based: group by ``board_code`` and chunk each group.
* Size-based (default): bundle the clients into fixed-size groups, e.g., 100 per bundle.
* School-based: group the clients by school code, then bundle each group into fixed-size chunks.
* Board-based: group the clients by board code, then bundle each group into fixed-size chunks.

Each bundle produces a merged PDF inside ``output/pdf_combined`` and a manifest JSON
record inside ``output/metadata`` that captures critical metadata for audits.
Expand Down
3 changes: 3 additions & 0 deletions pipeline/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ def run_step_2_preprocess(
df_filtered = preprocess.filter_columns(mapped_df)
df = preprocess.ensure_required_columns(df_filtered)

# Check that addresses are complete; keep only rows with complete addresses
df = preprocess.check_addresses_complete(df)

# Load configuration
vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))
Expand Down
59 changes: 59 additions & 0 deletions pipeline/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,65 @@ def format_iso_date_for_language(iso_date: str, language: str) -> str:

return format_date(date_obj, format="long", locale=locale)

def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame:
    """
    Validate address completeness and return only rows with complete addresses.

    An address is considered complete when the combined street address
    (line 1 and/or line 2), city, province, and postal code are all present
    and non-blank. Incomplete rows are logged, written to
    ``output/incomplete_addresses.csv`` for audit follow-up, and dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Input records. Must contain the columns STREET_ADDRESS_LINE_1,
        STREET_ADDRESS_LINE_2, CITY, PROVINCE, and POSTAL_CODE.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` restricted to rows with complete addresses, with a
        combined ``ADDRESS`` column added. The temporary
        ``address_complete`` flag is dropped before returning.
    """
    df = df.copy()

    address_cols = [
        "STREET_ADDRESS_LINE_1",
        "STREET_ADDRESS_LINE_2",
        "CITY",
        "PROVINCE",
        "POSTAL_CODE",
    ]

    # Normalize text fields with the nullable "string" dtype: genuine missing
    # values stay NA through the cast instead of becoming the literal strings
    # "nan"/"None"/"<NA>" that astype(str) would produce (which would make
    # missing fields look complete). Blank/whitespace-only cells become NA.
    for col in address_cols:
        df[col] = df[col].astype("string").str.strip().replace("", pd.NA)

    # Build the combined street-address line; a missing line contributes
    # nothing to the concatenation.
    df["ADDRESS"] = (
        df["STREET_ADDRESS_LINE_1"].fillna("") + " " +
        df["STREET_ADDRESS_LINE_2"].fillna("")
    ).str.strip()
    df["ADDRESS"] = df["ADDRESS"].replace("", pd.NA)

    # A row is complete only when every required component is present.
    df["address_complete"] = (
        df["ADDRESS"].notna()
        & df["CITY"].notna()
        & df["PROVINCE"].notna()
        & df["POSTAL_CODE"].notna()
    )

    if not df["address_complete"].all():
        incomplete_records = df.loc[~df["address_complete"]]
        LOG.warning(
            "There are %d records with incomplete address information.",
            len(incomplete_records),
        )

        # Persist the rejects for audit/follow-up; create the output
        # directory first so a fresh checkout does not fail on to_csv.
        incomplete_path = Path("output/incomplete_addresses.csv")
        incomplete_path.parent.mkdir(parents=True, exist_ok=True)
        incomplete_records.to_csv(incomplete_path, index=False)
        LOG.info("Incomplete address records written to %s", incomplete_path)

    # Return only rows with complete addresses.
    return df.loc[df["address_complete"]].drop(columns=["address_complete"])


def convert_date_iso(date_str: str) -> str:
"""Convert a date from English display format to ISO format.
Expand Down