Skip to content

Commit 35b77ac

Browse files
authored
Merge pull request #91 from broadinstitute/sc_by_guide
Create normalized single_cell_profiles_by_guide output
2 parents 21b16d7 + 50455de commit 35b77ac

File tree

1 file changed

+99
-18
lines changed

1 file changed

+99
-18
lines changed

1.generate-profiles/2.normalize.py

Lines changed: 99 additions & 18 deletions
Original file line number · Diff line number · Diff line change
@@ -70,11 +70,13 @@ def handle_excepthook(exc_type, exc_value, exc_traceback):
7070
normalize_input_files = config["files"]["aggregate_files"]
7171
normalize_output_files = config["files"]["normalize_files"]
7272
single_cell_file = config["files"]["single_file_only_output_file"]
73+
image_file = config["files"]["image_file"]
7374

7475
sc_config = config["options"]["profile"]["single_cell"]
7576
normalize_singlecell_from_single_file = sc_config["output_one_single_cell_file_only"]
7677

7778
normalize_args = config["options"]["profile"]["normalize"]
79+
output_single_cell_by_guide = normalize_args["output_single_cell_by_guide"]
7880
normalize_levels = normalize_args["levels"]
7981
normalize_by_samples = normalize_args["by_samples"]
8082
normalize_these_features = normalize_args["features"]
@@ -105,28 +107,107 @@ def handle_excepthook(exc_type, exc_value, exc_traceback):
105107
file_to_normalize.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
106108
)
107109

108-
print(
109-
f"Now normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
110-
)
111-
logging.info(
112-
f"Normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
113-
)
114-
115110
output_file = normalize_output_files[data_level]
116111
output_file = pathlib.Path(
117112
normalize_output_files[data_level].parents[0],
118113
output_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
119114
)
120-
df = read_csvs_with_chunksize(file_to_normalize)
121-
122-
normalize(
123-
profiles=df,
124-
features=normalize_these_features,
125-
samples=normalize_by_samples,
126-
method=normalize_method,
127-
output_file=output_file,
128-
compression_options=compression,
129-
float_format=float_format,
130-
)
115+
116+
if os.path.exists(output_file):
117+
if force:
118+
print(f"Force overwriting {output_file}")
119+
logging.info(f"Force overwriting {output_file}")
120+
print(
121+
f"Now normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
122+
)
123+
logging.info(
124+
f"Normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
125+
)
126+
df = read_csvs_with_chunksize(file_to_normalize)
127+
128+
# Don't normalize locations
129+
meta_cols=list(df.columns[df.columns.str.contains("Metadata")])
130+
remove_locs = list(filter(lambda x: "_Location_Center_X" in x or "_Location_Center_Y" in x , df.columns))
131+
remove_cents = list(filter(lambda x: "AreaShape_Center_X" in x or "AreaShape_Center_Y" in x , df.columns))
132+
meta_cols = meta_cols + remove_locs + remove_cents
133+
134+
normalize(
135+
profiles=df,
136+
features=normalize_these_features,
137+
meta_features=meta_cols,
138+
samples=normalize_by_samples,
139+
method=normalize_method,
140+
output_file=output_file,
141+
compression_options=compression,
142+
float_format=float_format,
143+
)
144+
else:
145+
print(
146+
f"Now normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
147+
)
148+
logging.info(
149+
f"Normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
150+
)
151+
df = read_csvs_with_chunksize(file_to_normalize)
152+
153+
# Don't normalize locations
154+
meta_cols=list(df.columns[df.columns.str.contains("Metadata")])
155+
remove_locs = list(filter(lambda x: "_Location_Center_X" in x or "_Location_Center_Y" in x , df.columns))
156+
remove_cents = list(filter(lambda x: "AreaShape_Center_X" in x or "AreaShape_Center_Y" in x , df.columns))
157+
meta_cols = meta_cols + remove_locs + remove_cents
158+
159+
normalize(
160+
profiles=df,
161+
features=normalize_these_features,
162+
meta_features=meta_cols,
163+
samples=normalize_by_samples,
164+
method=normalize_method,
165+
output_file=output_file,
166+
compression_options=compression,
167+
float_format=float_format,
168+
)
169+
170+
if data_level == "single_cell":
171+
if output_single_cell_by_guide:
172+
print(
173+
f"Now outputting normalized single cell profiles by guide for split {data_split_site}"
174+
)
175+
logging.info(
176+
f"Now outputting normalized single cell profiles by guide for split {data_split_site}"
177+
)
178+
# Load image alignment information for appending to single_cell_by_guide csvs
179+
image_df = pd.read_csv(image_file, sep="\t")
180+
keep_columns = []
181+
for col in image_df.columns:
182+
if "Align_" in col:
183+
keep_columns.append(col)
184+
keep_columns.append("Metadata_site")
185+
image_df = image_df.loc[:, keep_columns]
186+
187+
sc_by_guide_folder = os.path.join(
188+
single_cell_input_dir, "single_cell_by_guide"
189+
)
190+
if not os.path.isdir(sc_by_guide_folder):
191+
os.mkdir(sc_by_guide_folder)
192+
df = read_csvs_with_chunksize(output_file)
193+
for guide in set(df["Metadata_Foci_Barcode_MatchedTo_Barcode"]):
194+
gene = df[df["Metadata_Foci_Barcode_MatchedTo_Barcode"] == guide][
195+
"Metadata_Foci_Barcode_MatchedTo_GeneCode"
196+
].tolist()[0]
197+
guide_file_name = f"{str(output_file).split('__')[0].split('/')[-1]}__{guide}_{gene}.csv.gz"
198+
guide_path = os.path.join(sc_by_guide_folder, guide_file_name)
199+
if not os.path.exists(guide_path):
200+
guide_df = pd.DataFrame()
201+
else:
202+
guide_df = read_csvs_with_chunksize(guide_path)
203+
append_df = df.loc[
204+
df["Metadata_Foci_Barcode_MatchedTo_Barcode"] == guide
205+
]
206+
append_df = append_df.merge(
207+
image_df, left_on="Metadata_Foci_site", right_on="Metadata_site"
208+
)
209+
guide_df = guide_df.append(append_df)
210+
guide_df.to_csv(guide_path, index=False)
211+
131212
print("Finished 2.normalize.")
132213
logging.info("Finished 2.normalize.")

0 commit comments

Comments (0)