taesiri committed on
Commit
a3b292d
·
1 Parent(s): 602a574
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -262,24 +262,29 @@ def get_statistics():
262
  if not data_dir.exists():
263
  return "No data directory found"
264
 
265
- # Count folders with metadata.json
266
- total_posts = 0
267
  posts_with_responses = 0
268
  total_responses = 0
269
  responses_per_post = [] # List to track number of responses for each post
270
 
271
  for metadata_file in data_dir.glob("*/metadata.json"):
272
- total_posts += 1
273
- try:
274
- with open(metadata_file, "r") as f:
275
- metadata = json.load(f)
276
- num_responses = len(metadata.get("responses", []))
277
- responses_per_post.append(num_responses)
278
- if num_responses > 0:
279
- posts_with_responses += 1
280
- total_responses += num_responses
281
- except:
282
- continue
 
 
 
 
 
283
 
284
  # Calculate additional statistics
285
  if responses_per_post:
@@ -295,17 +300,21 @@ def get_statistics():
295
  stats = f"""
296
  📊 Collection Statistics:
297
 
298
- Overall Progress:
299
- - Total Posts Processed: {total_posts}
 
 
 
 
 
300
  - Posts with Responses: {posts_with_responses}
 
301
  - Total Individual Responses: {total_responses}
302
- - Completion Rate: {(posts_with_responses/len(VALID_DATASET_POST_IDS)*100):.2f}%
303
 
304
  Response Distribution:
305
  - Median Responses per Post: {median_responses}
306
  - Average Responses per Post: {avg_responses:.2f}
307
  - Maximum Responses for a Post: {max_responses}
308
- - Posts with No Responses: {total_posts - posts_with_responses}
309
  """
310
  return stats
311
 
 
262
  if not data_dir.exists():
263
  return "No data directory found"
264
 
265
+ total_expected_posts = len(VALID_DATASET_POST_IDS)
266
+ processed_post_ids = set()
267
  posts_with_responses = 0
268
  total_responses = 0
269
  responses_per_post = [] # List to track number of responses for each post
270
 
271
  for metadata_file in data_dir.glob("*/metadata.json"):
272
+ post_id = metadata_file.parent.name
273
+ if post_id in VALID_DATASET_POST_IDS: # Only count valid posts
274
+ processed_post_ids.add(post_id)
275
+ try:
276
+ with open(metadata_file, "r") as f:
277
+ metadata = json.load(f)
278
+ num_responses = len(metadata.get("responses", []))
279
+ responses_per_post.append(num_responses)
280
+ if num_responses > 0:
281
+ posts_with_responses += 1
282
+ total_responses += num_responses
283
+ except:
284
+ continue
285
+
286
+ missing_posts = set(map(str, VALID_DATASET_POST_IDS)) - processed_post_ids
287
+ total_processed = len(processed_post_ids)
288
 
289
  # Calculate additional statistics
290
  if responses_per_post:
 
300
  stats = f"""
301
  📊 Collection Statistics:
302
 
303
+ Dataset Coverage:
304
+ - Total Expected Posts: {total_expected_posts}
305
+ - Posts Processed: {total_processed}
306
+ - Missing Posts: {len(missing_posts)} ({', '.join(list(missing_posts)[:5])}{'...' if len(missing_posts) > 5 else ''})
307
+ - Coverage Rate: {(total_processed/total_expected_posts*100):.2f}%
308
+
309
+ Response Statistics:
310
  - Posts with Responses: {posts_with_responses}
311
+ - Posts without Responses: {total_processed - posts_with_responses}
312
  - Total Individual Responses: {total_responses}
 
313
 
314
  Response Distribution:
315
  - Median Responses per Post: {median_responses}
316
  - Average Responses per Post: {avg_responses:.2f}
317
  - Maximum Responses for a Post: {max_responses}
 
318
  """
319
  return stats
320