@@ -86,6 +86,7 @@ def handle_excepthook(exc_type, exc_value, exc_traceback):
 single_file_only = sc_config["output_one_single_cell_file_only"]
 force = sc_config["force_overwrite"]
 perform = sc_config["perform"]
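+# Maximum number of site-level errors tolerated before the remaining sites are skipped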
+allowed_skips = sc_config["allowed_skips"]
 
 gene_col = config["options"]["profile"]["aggregate"]["levels"]["gene"]
 
@@ -122,107 +123,128 @@ def handle_excepthook(exc_type, exc_value, exc_traceback):
     config["experiment"], sites, split_info, separator="___"
 )
 
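+# Running count of sites skipped because of errors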
+allowed_skip_counter = 0
 for data_split_site in site_info_dict:
     split_sites = site_info_dict[data_split_site]
 
     sc_df = []
     for site in split_sites:
-        # Define single cell output directory and files
-        site_output_dir = pathlib.Path(single_cell_output_dir, site)
-        site_output_dir.mkdir(parents=True, exist_ok=True)
-        sc_output_file = pathlib.Path(site_output_dir, f"{site}_single_cell.csv.gz")
-
-        # Define options based on input flags
-        if single_file_only:
-            print(
-                f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
-            )
-            logging.info(
-                f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
-            )
-        else:
-            # If the output file already exists, only overwrite if --force is provided
-            if sc_output_file.exists():
-                if not force:
-                    print(
-                        f"Skipping reprocessing single cells for site: {site}... use --force to overwrite"
-                    )
-                    logging.info(f"Skipped reprocessing single cells for site: {site}")
-                    continue
-                else:
-                    print(f"Now overwriting single cells for site: {site}...")
-                    logging.info(f"Overwrote single cells for site: {site}")
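+        # Process the site only while the error budget holds; once more than
+        # allowed_skips sites have failed, remaining sites are passed over untouched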
+        if allowed_skips >= allowed_skip_counter:
+            # Define single cell output directory and files
+            site_output_dir = pathlib.Path(single_cell_output_dir, site)
+            site_output_dir.mkdir(parents=True, exist_ok=True)
+            sc_output_file = pathlib.Path(site_output_dir, f"{site}_single_cell.csv.gz")
+
+            # Define options based on input flags
+            if single_file_only:
+                print(
+                    f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
+                )
+                logging.info(
+                    f"Building single file for dataset {data_split_site}; combining single cells from site: {site}..."
+                )
             else:
-                print(f"Now processing single cells for site: {site}...")
-                logging.info(f"Processed single cells for site: {site}")
-
-        # Point to appropriate directories
-        site_metadata_dir = pathlib.Path(input_paintdir, site)
-        site_compartment_dir = pathlib.Path(input_batchdir, site)
-
-        # Load cell metadata after cell quality determined in 0.preprocess-sites
-        metadata_file = pathlib.Path(site_metadata_dir, f"metadata_{site}.tsv.gz")
-        try:
-            metadata_df = read_csvs_with_chunksize(metadata_file, sep="\t").query(
-                f"{cell_quality_col} in @cell_filter"
-            )
-        except:
-            print(f"Error loading metadata file for {site}. Skipping.")
-            logging.info(f"Error loading metadata file for {site}. Skipping.")
-            continue
-
-        if sanitize_genes:
-            metadata_df = sanitize_gene_col(metadata_df, gene_col, control_barcodes)
-            if len(metadata_df) == 0:
-                continue
+                # If the output file already exists, only overwrite if --force is provided
+                if sc_output_file.exists():
+                    if not force:
+                        print(
+                            f"Skipping reprocessing single cells for site: {site}... use --force to overwrite"
+                        )
+                        logging.info(f"Skipped reprocessing single cells for site: {site}")
+                        continue
+                    else:
+                        print(f"Now overwriting single cells for site: {site}...")
+                        logging.info(f"Overwrote single cells for site: {site}")
+                else:
+                    print(f"Now processing single cells for site: {site}...")
+                    logging.info(f"Processed single cells for site: {site}")
 
-        # Load csv files for prespecified compartments
-        compartment_csvs = {}
-        for compartment in compartments:
-            try:
-                metadata_cols = parent_col_info[compartment.lower()] + id_cols
-            except KeyError:
-                metadata_cols = id_cols
+            # Point to appropriate directories
+            site_metadata_dir = pathlib.Path(input_paintdir, site)
+            site_compartment_dir = pathlib.Path(input_batchdir, site)
+
+            # Load cell metadata after cell quality determined in 0.preprocess-sites
+            metadata_file = pathlib.Path(site_metadata_dir, f"metadata_{site}.tsv.gz")
             try:
-                compartment_csvs[compartment] = load_single_cell_compartment_csv(
-                    site_compartment_dir, compartment, metadata_cols
+                metadata_df = read_csvs_with_chunksize(metadata_file, sep="\t").query(
+                    f"{cell_quality_col} in @cell_filter"
                 )
-            except FileNotFoundError:
+            except:
+                print(f"Error loading metadata file for {site}. Skipping.")
+                logging.info(f"Error loading metadata file for {site}. Skipping.")
+                allowed_skip_counter += 1
+                print(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                logging.warning(f"Now at {allowed_skip_counter} sites skipped due to errors.")
                 continue
 
-        if len(compartment_csvs) != len(compartments):
-            warnings.warn(
-                f"Not all compartments are present in site: {site}\nCheck CellProfiler output path: {site_compartment_dir}. Skipping this site."
+            if sanitize_genes:
+                try:
+                    metadata_df = sanitize_gene_col(metadata_df, gene_col, control_barcodes)
+                    if len(metadata_df) == 0:
+                        print(f"Metadata file empty for {site}. Skipping.")
+                        logging.info(f"Metadata file empty for {site}. Skipping.")
+                        allowed_skip_counter += 1
+                        print(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                        logging.warning(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                        continue
+                except:
+                    print(f"Sanitizing genes failed for {site}. Skipping.")
+                    logging.info(f"Sanitizing genes failed for {site}. Skipping.")
+                    allowed_skip_counter += 1
+                    print(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                    logging.warning(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                    continue
+
+            # Load csv files for prespecified compartments
+            compartment_csvs = {}
+            for compartment in compartments:
+                try:
+                    metadata_cols = parent_col_info[compartment.lower()] + id_cols
+                except KeyError:
+                    metadata_cols = id_cols
+                try:
+                    compartment_csvs[compartment] = load_single_cell_compartment_csv(
+                        site_compartment_dir, compartment, metadata_cols
+                    )
+                except FileNotFoundError:
+                    continue
+
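+            # Skip the site unless every expected compartment CSV was loaded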
+            if len(compartment_csvs) != len(compartments):
+                warnings.warn(
+                    f"Not all compartments are present in site: {site}\nCheck CellProfiler output path: {site_compartment_dir}. Skipping this site."
+                )
+                logging.warning(
+                    f"{site} skipped because of missing compartments in {site_compartment_dir}."
+                )
+                allowed_skip_counter += 1
+                print(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                logging.warning(f"Now at {allowed_skip_counter} sites skipped due to errors.")
+                continue
+
+            # Merge single cell compartments together
+            sc_merged_df = merge_single_cell_compartments(
+                compartment_csvs, merge_info, id_cols
             )
-            logging.warning(
-                f"{site} skipped because of missing compartments in {site_compartment_dir}."
+            sc_merged_df = sc_merged_df.assign(Metadata_Foci_site=site).reindex(
+                ["Metadata_Foci_site"] + all_feature_df.feature_name.tolist(),
+                axis="columns",
             )
-            continue
 
-        # Merge single cell compartments together
-        sc_merged_df = merge_single_cell_compartments(
-            compartment_csvs, merge_info, id_cols
-        )
-        sc_merged_df = sc_merged_df.assign(Metadata_Foci_site=site).reindex(
-            ["Metadata_Foci_site"] + all_feature_df.feature_name.tolist(),
-            axis="columns",
-        )
+            # Merge single cell profiles and metadata
+            sc_merged_df = metadata_df.merge(
+                sc_merged_df, on=merge_info["metadata_linking_columns"], how="left"
+            ).reset_index(drop=True)
 
-        # Merge single cell profiles and metadata
-        sc_merged_df = metadata_df.merge(
-            sc_merged_df, on=merge_info["metadata_linking_columns"], how="left"
-        ).reset_index(drop=True)
-
-        if single_file_only:
-            sc_df.append(sc_merged_df)
-        else:
-            sc_merged_df.to_csv(
-                sc_output_file,
-                sep=",",
-                index=False,
-                compression=compression,
-                float_format=float_format,
-            )
+            if single_file_only:
+                sc_df.append(sc_merged_df)
+            else:
+                sc_merged_df.to_csv(
+                    sc_output_file,
+                    sep=",",
+                    index=False,
+                    compression=compression,
+                    float_format=float_format,
+                )
 
     if single_file_only:
         # Define a dataset specific file
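
The skip logic added in this commit reduces to a small error-budget pattern: process a unit of work only while the number of failures seen so far is within an allowed budget, and once the budget is exceeded, let the remaining units fall through untouched. Below is a minimal, self-contained sketch of that pattern; process_site and SITES are hypothetical placeholders and the budget value is illustrative.

    allowed_skips = 1  # stand-in for sc_config["allowed_skips"]
    allowed_skip_counter = 0

    SITES = ["A01-1", "B02-2", "C03-3"]

    def process_site(site):
        # Hypothetical per-site work; fail for one site to exercise the budget
        if site == "B02-2":
            raise IOError(f"missing input for {site}")

    for site in SITES:
        if allowed_skips >= allowed_skip_counter:
            try:
                process_site(site)
            except Exception:
                allowed_skip_counter += 1
                print(f"Now at {allowed_skip_counter} sites skipped due to errors.")
                continue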