Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified input/rodent_dataset.xlsx
Binary file not shown.
8 changes: 3 additions & 5 deletions pipeline/bundle_pdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
accompanying manifest records. It can be invoked as a CLI tool or imported for
unit testing. Bundling supports three modes:

* Size-based (default): chunk the ordered list of PDFs into groups of
``bundle_size``.
* School-based: group by ``school_code`` and then chunk each group while
preserving client order.
* Board-based: group by ``board_code`` and chunk each group.
* Size-based (default): bundle the clients into fixed-size groups, e.g., 100 per bundle.
* School-based: group the clients by school code, then bundle each group into fixed-size chunks.
* Board-based: group the clients by board code, then bundle each group into fixed-size chunks.

Each bundle produces a merged PDF inside ``output/pdf_combined`` and a manifest JSON
record inside ``output/metadata`` that captures critical metadata for audits.
Expand Down
3 changes: 3 additions & 0 deletions pipeline/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ def run_step_2_preprocess(
df_filtered = preprocess.filter_columns(mapped_df)
df = preprocess.ensure_required_columns(df_filtered)

# Check that addresses are complete; keep only rows with complete addresses
df = preprocess.check_addresses_complete(df)

# Load configuration
vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))
Expand Down
59 changes: 59 additions & 0 deletions pipeline/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,65 @@ def format_iso_date_for_language(iso_date: str, language: str) -> str:

return format_date(date_obj, format="long", locale=locale)

def check_addresses_complete(df: pd.DataFrame) -> pd.DataFrame:
    """
    Validate address completeness and return only rows with complete addresses.

    An address is considered complete when the combined street address
    (line 1 and/or line 2), city, province, and postal code are all present
    and non-blank. Incomplete rows are logged, written to
    ``output/incomplete_addresses.csv`` for audit follow-up, and dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Input records. Must contain the columns STREET_ADDRESS_LINE_1,
        STREET_ADDRESS_LINE_2, CITY, PROVINCE, and POSTAL_CODE.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` restricted to rows with complete addresses, with a
        combined ``ADDRESS`` column added. The temporary
        ``address_complete`` flag is dropped before returning.
    """
    df = df.copy()

    address_cols = [
        "STREET_ADDRESS_LINE_1",
        "STREET_ADDRESS_LINE_2",
        "CITY",
        "PROVINCE",
        "POSTAL_CODE",
    ]

    # Normalize text fields with the nullable "string" dtype: genuine missing
    # values stay NA through the cast instead of becoming the literal strings
    # "nan"/"None"/"<NA>" that astype(str) would produce (which would make
    # missing fields look complete). Blank/whitespace-only cells become NA.
    for col in address_cols:
        df[col] = df[col].astype("string").str.strip().replace("", pd.NA)

    # Build the combined street-address line; a missing line contributes
    # nothing to the concatenation.
    df["ADDRESS"] = (
        df["STREET_ADDRESS_LINE_1"].fillna("") + " " +
        df["STREET_ADDRESS_LINE_2"].fillna("")
    ).str.strip()
    df["ADDRESS"] = df["ADDRESS"].replace("", pd.NA)

    # A row is complete only when every required component is present.
    df["address_complete"] = (
        df["ADDRESS"].notna()
        & df["CITY"].notna()
        & df["PROVINCE"].notna()
        & df["POSTAL_CODE"].notna()
    )

    if not df["address_complete"].all():
        incomplete_records = df.loc[~df["address_complete"]]
        LOG.warning(
            "There are %d records with incomplete address information.",
            len(incomplete_records),
        )

        # Persist the rejects for audit/follow-up; create the output
        # directory first so a fresh checkout does not fail on to_csv.
        incomplete_path = Path("output/incomplete_addresses.csv")
        incomplete_path.parent.mkdir(parents=True, exist_ok=True)
        incomplete_records.to_csv(incomplete_path, index=False)
        LOG.info("Incomplete address records written to %s", incomplete_path)

    # Return only rows with complete addresses.
    return df.loc[df["address_complete"]].drop(columns=["address_complete"])


def convert_date_iso(date_str: str) -> str:
"""Convert a date from English display format to ISO format.
Expand Down