Makes the dummy ped file creator CLI-compatible

f3644cbd · Manavalan Gajapathy · 0f0ef52e · f3644cbd
Commit f3644cbd authored 4 years ago by Manavalan Gajapathy
--- a/src/create_dummy_ped.py
+++ b/src/create_dummy_ped.py
@@ -2,17 +2,24 @@
 Create dummy ped file by project

 Usage:
+# setup environment
 ml reset
 ml Anaconda3
 conda activate quac_common
-python src/create_dummy_ped.py
+
+# Example
+python src/create_dummy_ped.py --project_path "/data/project/worthey_lab/projects/CF_CFF_PFarrell/" --outfile test.ped
 """

 from pathlib import Path
 import pandas as pd
+import fire


 def read_project_tracker(project_tracker_f):
+    """
+    Reads the project tracking Excel file. Expects the columns "CGDS ID" and "Sex" to be present.
+    """

    df = pd.read_excel(project_tracker_f, usecols=["CGDS ID", "Sex"])

@@ -24,70 +31,51 @@ def read_project_tracker(project_tracker_f):
    return sample_sex_dict


-def nbbbb():
+def main(project_path, outfile, tracking_sheet=False):
+    """
+    Creates dummy pedigree file for the project requested
+
+    Args:
+        project_path (str): Project path. Script will look for samples under its subdirectory "analysis".
+        outfile (str): Output pedigree file path
+        tracking_sheet (str, optional): Project tracking sheet in Excel format, used to look up each sample's sex. Defaults to False, in which case sex is written as "-9".
+    """

-    project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
+    # get sample's sex info from project tracking sheet, if supplied
+    if tracking_sheet:
+        sample_sex_dict = read_project_tracker(tracking_sheet)
+
+    # get samples from cheaha for the project
+    project_path = Path(project_path) / "analysis"
    samples = (
        f.name for f in project_path.iterdir() if f.is_dir() and f.name.startswith(("LW", "UDN"))
    )

    header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
-    with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
+    with open(outfile, "w") as out_handle:
        out_handle.write("\t".join(header) + "\n")

        for sample in sorted(samples):
-            data = ["unknown", sample, "-9", "-9", "-9", "-9"]
+            data = [
+                "unknown",
+                sample,
+                "-9",  # father
+                "-9",  # mother
+                sample_sex_dict[sample] if tracking_sheet else "-9",  # sample sex
+                "-9",  # affected
+            ]
            out_handle.write("\t".join(data) + "\n")

    return None


-def main(outpath):
-
-    project_dict = {
-        "CF_CFF_PFarrell": {
-            "tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
-            "affected": "all",
-        },
-        "CF_TLOAF_PFarrell": {
-            "tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
-            "affected": "all",
-        },
-        # "EDS3_unkn_DGreenspan",
-        # "MuscDyst_SU_MAlexander",
-        # "UDN_Phase1_EAWorthey",
-    }
-
-    for project_name in project_dict:
-        # get sample's sex info from project tracking sheet
-        sample_sex_dict = read_project_tracker(project_dict[project_name]["tracking_sheet"])
-
-        # get samples from cheaha for the project
-        project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
-        samples = (
-            f.name
-            for f in project_path.iterdir()
-            if f.is_dir() and f.name.startswith(("LW", "UDN"))
-        )
-
-        header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
-        with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
-            out_handle.write("\t".join(header) + "\n")
-
-            for sample in sorted(samples):
-                data = [
-                    "unknown",
-                    sample,
-                    "-9",  # father
-                    "-9",  # mother
-                    sample_sex_dict[sample],  # sample sex
-                    "1" if project_dict[project_name]["affected"] == "all" else "-9",  # affected
-                ]
-                out_handle.write("\t".join(data) + "\n")
-
-    return None
-
-
 if __name__ == "__main__":
-    OUT_PATH = "data/raw/ped"  # not so raw, is it?
-    main(OUT_PATH)
+    FIRE_MODE = True
+    # FIRE_MODE = False
+
+    if FIRE_MODE:
+        fire.Fire(main)
+    else:
+        PROJECT_PATH = "/data/project/worthey_lab/projects/CF_CFF_PFarrell/"
+        OUTFILE = "out.ped"
+        main(PROJECT_PATH, OUTFILE)