Commit 0b5cadb

Merge pull request #90 from broadinstitute/skip_counter
add skip counter to 0.merge-single-cells
2 parents 879f89b + 3f24b16 commit 0b5cadb

File tree: 1 file changed
1.generate-profiles/0.merge-single-cells.py

Lines changed: 108 additions & 86 deletions
@@ -86,6 +86,7 @@ def handle_excepthook(exc_type, exc_value, exc_traceback):
 single_file_only = sc_config["output_one_single_cell_file_only"]
 force = sc_config["force_overwrite"]
 perform = sc_config["perform"]
+allowed_skips = sc_config["allowed_skips"]
 
 gene_col = config["options"]["profile"]["aggregate"]["levels"]["gene"]
 
@@ -122,107 +123,128 @@ def handle_excepthook(exc_type, exc_value, exc_traceback):
     config["experiment"], sites, split_info, separator="___"
 )
 
+allowed_skip_counter = 0
 for data_split_site in site_info_dict:
     split_sites = site_info_dict[data_split_site]
 
     sc_df = []
     for site in split_sites:
-        # Define single cell output directory and files
-        site_output_dir = pathlib.Path(single_cell_output_dir, site)
-        site_output_dir.mkdir(parents=True, exist_ok=True)
-        sc_output_file = pathlib.Path(site_output_dir, f"{site}_single_cell.csv.gz")
-
-        # Define options based on input flags
-        if single_file_only:
-            print(
-                f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
-            )
-            logging.info(
-                f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
-            )
-        else:
-            # If the output file already exists, only overwrite if --force is provided
-            if sc_output_file.exists():
-                if not force:
-                    print(
-                        f"Skipping reprocessing single cells for site: {site}... use --force to overwrite"
-                    )
-                    logging.info(f"Skipped reprocessing single cells for site: {site}")
-                    continue
-                else:
-                    print(f"Now overwriting single cells for site: {site}...")
-                    logging.info(f"Overwrote single cells for site: {site}")
+        if allowed_skips >= allowed_skip_counter:
+            # Define single cell output directory and files
+            site_output_dir = pathlib.Path(single_cell_output_dir, site)
+            site_output_dir.mkdir(parents=True, exist_ok=True)
+            sc_output_file = pathlib.Path(site_output_dir, f"{site}_single_cell.csv.gz")
+
+            # Define options based on input flags
+            if single_file_only:
+                print(
+                    f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
+                )
+                logging.info(
+                    f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
+                )
             else:
-                print(f"Now processing single cells for site: {site}...")
-                logging.info(f"Processed single cells for site: {site}")
-
-        # Point to appropriate directories
-        site_metadata_dir = pathlib.Path(input_paintdir, site)
-        site_compartment_dir = pathlib.Path(input_batchdir, site)
-
-        # Load cell metadata after cell quality determined in 0.preprocess-sites
-        metadata_file = pathlib.Path(site_metadata_dir, f"metadata_{site}.tsv.gz")
-        try:
-            metadata_df = read_csvs_with_chunksize(metadata_file, sep="\t").query(
-                f"{cell_quality_col} in @cell_filter"
-            )
-        except:
-            print(f"Error loading metadata file for {site}. Skipping.")
-            logging.info(f"Error loading metadata file for {site}. Skipping.")
-            continue
-
-        if sanitize_genes:
-            metadata_df = sanitize_gene_col(metadata_df, gene_col, control_barcodes)
-        if len(metadata_df) == 0:
-            continue
+                # If the output file already exists, only overwrite if --force is provided
+                if sc_output_file.exists():
+                    if not force:
+                        print(
+                            f"Skipping reprocessing single cells for site: {site}... use --force to overwrite"
+                        )
+                        logging.info(f"Skipped reprocessing single cells for site: {site}")
+                        continue
+                    else:
+                        print(f"Now overwriting single cells for site: {site}...")
+                        logging.info(f"Overwrote single cells for site: {site}")
+                else:
+                    print(f"Now processing single cells for site: {site}...")
+                    logging.info(f"Processed single cells for site: {site}")
 
-        # Load csv files for prespecified compartments
-        compartment_csvs = {}
-        for compartment in compartments:
-            try:
-                metadata_cols = parent_col_info[compartment.lower()] + id_cols
-            except KeyError:
-                metadata_cols = id_cols
+            # Point to appropriate directories
+            site_metadata_dir = pathlib.Path(input_paintdir, site)
+            site_compartment_dir = pathlib.Path(input_batchdir, site)
+
+            # Load cell metadata after cell quality determined in 0.preprocess-sites
+            metadata_file = pathlib.Path(site_metadata_dir, f"metadata_{site}.tsv.gz")
             try:
-                compartment_csvs[compartment] = load_single_cell_compartment_csv(
-                    site_compartment_dir, compartment, metadata_cols
+                metadata_df = read_csvs_with_chunksize(metadata_file, sep="\t").query(
+                    f"{cell_quality_col} in @cell_filter"
                 )
-            except FileNotFoundError:
+            except:
+                print(f"Error loading metadata file for {site}. Skipping.")
+                logging.info(f"Error loading metadata file for {site}. Skipping.")
+                allowed_skip_counter += 1
+                print(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                logging.warning(f"Now at {allowed_skip_counter} sites skipped from errors.")
                 continue
 
-        if len(compartment_csvs) != len(compartments):
-            warnings.warn(
-                f"Not all compartments are present in site: {site}\nCheck CellProfiler output path: {site_compartment_dir}. Skipping this site."
+            if sanitize_genes:
+                try:
+                    metadata_df = sanitize_gene_col(metadata_df, gene_col, control_barcodes)
+                    if len(metadata_df) == 0:
+                        print(f"Metadata file empty for {site}. Skipping.")
+                        logging.info(f"Metadata file empty for {site}. Skipping.")
+                        allowed_skip_counter += 1
+                        print(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                        logging.warning(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                        continue
+                except:
+                    print(f"Sanitizing genes failed for {site}. Skipping.")
+                    logging.info(f"Sanitizing genes failed for {site}. Skipping.")
+                    allowed_skip_counter += 1
+                    print(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                    logging.warning(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                    continue
+
+            # Load csv files for prespecified compartments
+            compartment_csvs = {}
+            for compartment in compartments:
+                try:
+                    metadata_cols = parent_col_info[compartment.lower()] + id_cols
+                except KeyError:
+                    metadata_cols = id_cols
+                try:
+                    compartment_csvs[compartment] = load_single_cell_compartment_csv(
+                        site_compartment_dir, compartment, metadata_cols
+                    )
+                except FileNotFoundError:
+                    continue
+
+            if len(compartment_csvs) != len(compartments):
+                warnings.warn(
+                    f"Not all compartments are present in site: {site}\nCheck CellProfiler output path: {site_compartment_dir}. Skipping this site."
+                )
+                logging.warning(
+                    f"{site} skipped because of missing compartments in {site_compartment_dir}."
+                )
+                allowed_skip_counter += 1
+                print(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                logging.warning(f"Now at {allowed_skip_counter} sites skipped from errors.")
+                continue
+
+            # Merge single cell compartments together
+            sc_merged_df = merge_single_cell_compartments(
+                compartment_csvs, merge_info, id_cols
             )
-            logging.warning(
-                f"{site} skipped because of missing compartments in {site_compartment_dir}."
+            sc_merged_df = sc_merged_df.assign(Metadata_Foci_site=site).reindex(
+                ["Metadata_Foci_site"] + all_feature_df.feature_name.tolist(),
+                axis="columns",
             )
-            continue
 
-        # Merge single cell compartments together
-        sc_merged_df = merge_single_cell_compartments(
-            compartment_csvs, merge_info, id_cols
-        )
-        sc_merged_df = sc_merged_df.assign(Metadata_Foci_site=site).reindex(
-            ["Metadata_Foci_site"] + all_feature_df.feature_name.tolist(),
-            axis="columns",
-        )
+            # Merge single cell profiles and metadata
+            sc_merged_df = metadata_df.merge(
+                sc_merged_df, on=merge_info["metadata_linking_columns"], how="left"
+            ).reset_index(drop=True)
 
-        # Merge single cell profiles and metadata
-        sc_merged_df = metadata_df.merge(
-            sc_merged_df, on=merge_info["metadata_linking_columns"], how="left"
-        ).reset_index(drop=True)
-
-        if single_file_only:
-            sc_df.append(sc_merged_df)
-        else:
-            sc_merged_df.to_csv(
-                sc_output_file,
-                sep=",",
-                index=False,
-                compression=compression,
-                float_format=float_format,
-            )
+            if single_file_only:
+                sc_df.append(sc_merged_df)
+            else:
+                sc_merged_df.to_csv(
+                    sc_output_file,
+                    sep=",",
+                    index=False,
+                    compression=compression,
+                    float_format=float_format,
+                )
 
 if single_file_only:
     # Define a dataset specific file
0 commit comments