potluck.meta
Routines for checking whether specifications are correctly implemented and working as intended.
meta.py
"""
Routines for checking whether specifications are correctly implemented
and working as intended.

meta.py
"""

import re

from . import file_utils


EXPECTATIONS = {}
"""
Global storage for expectations by spec module name, mode, and username.
Entries are module names with mode dictionaries as values, whose keys are
modes (i.e., "evaluation" or "validation") and whose values are
dictionaries that map usernames to lists of expectations.
"""

CURRENT_EXAMPLE = None
"""
Which username are expectations automatically registered for?
"""


def simplify(description):
    """
    Normalizes case and removes HTML tags from the given goal
    description, for use in expectation matching. Note that angle
    brackets which aren't used for HTML tags are assumed to already be
    escaped. Adds '^^^' at the start and '$$$' at the end so that rules
    can use those anchors (or part of them) for disambiguation.
    """
    stripped = re.sub(r"<[^>]*>", '', description)
    return '^^^' + stripped.casefold() + '$$$'


def all_row_trails(report_or_row, trail_prefix=None):
    """
    Visits each row & sub-row of a report (or a report row) one by one.
    For each row visited, it yields a tuple containing that row, followed
    by a trail: a list of the first description entry of each ancestor of
    that row, starting from the top-level ancestor and going down and
    including that row.

    If provided, the given trail prefix (a list of strings) will be
    included before the start of each trail.
    """
    if trail_prefix is None:
        trail_prefix = []

    if "table" in report_or_row:
        # A whole report: it has no description of its own, so just
        # visit each of its top-level rows.
        for entry in report_or_row["table"]:
            yield from all_row_trails(entry, trail_prefix)
    else:
        desc = report_or_row["description"][0]
        below = trail_prefix + [desc]
        # Parents are yielded before their children (pre-order).
        yield (report_or_row, below)
        if "subtable" in report_or_row:
            for entry in report_or_row["subtable"]:
                yield from all_row_trails(entry, below)


class ExpectedWarning:
    """
    An expected warning provides a heads-up that a warning containing
    certain text is expected, so that getting such a warning won't fail a
    check.
    """
    def __init__(self, message_fragment=''):
        """
        A message fragment should be provided, or by default all warnings
        will be ignored. If a fragment is provided, all warnings which
        include that fragment as part of their raw HTML code string will
        be ignored, but other warnings will not be.
        """
        self.fragment = message_fragment

    def unexpected(self, warnings):
        """
        Returns all of the warnings from the given list which *aren't*
        expected, given this expectation that certain warning(s) might be
        present.
        """
        return [w for w in warnings if self.fragment not in w]


class Expectation:
    """
    An expectation establishes that a specific goal should evaluate to a
    specific result within a report. These expectations can be tested
    to make sure that a specification is working as designed.

    To specify which goal the expectation applies to, there are two
    options: you can provide a fragment of the goal's identifier as a
    string (it must match exactly one goal; see
    `potluck.rubrics.Rubric.goals_by_id`), or you can provide a list of
    strings, each of which must uniquely match against an item in a
    report table at a specific level, with the next string matching
    against that row's sub-table, and so on. These matches are performed
    in a case-insensitive manner with HTML tags stripped out, against
    the primary obfuscated description entry for each goal/category. The
    specified string only has to match part of the goal description, but
    it must not match multiple goal descriptions at a given table level.
    The characters '^^^' are added to the beginning of the rubric string,
    and '$$$' to the end, to aid in disambiguation.

    Because of these matching rules, for a rubric where the standard
    metric `potluck.rubrics.core_extras_categorized_metric` is used,
    goal paths are usually straightforward to construct when default
    descriptions are in place. Some examples of both id-fragment and
    goal-path methods:

    - For a core FunctionDef Check for function 'foo':
        `"core.check:def-foo$"` OR
        `[ "procedure", "core", "define foo" ]`

    - For an extra FunctionCall Check for 'bar' as a sub-rule of the
      check above:
        `"core.check:def-foo:call-bar$"` OR
        `[ "procedure", "extra", "define foo", "call bar" ]`

    - For a core trace test of function 'foo', assuming it was created
      with group_name "trace":
        `"core.test:foo:trace"` OR
        `[ "process", "core", "the foo function must" ]`
      - (note that one could also use:)
        `"^goal:core.test:foo:trace$"` OR
        `[ "process", "core", "^the foo function must" ]`

    - For a core result value test of function 'foo' (with no group_name):
        `"core.test:foo$"` OR
        `[ "product", "core", "foo returns" ]`
        (Note for the ID version, the $ is important to distinguish from
        the case above.)

    - For a core printed output test of function 'foo' (with group_name
      "output"):
        `"core.test:foo:output"` OR
        `[ "behavior", "core", "foo prints" ]`
    """
    def __init__(self, goal_spec, expected_status):
        """
        The goal_spec is a list of strings specifying how to find the
        goal in a report (strings are matched against descriptions to
        find sub-tables). Alternatively, the goal_spec may be a single
        string, which must match a single goal in the rubric using
        the same rules as `potluck.rubrics.Rubric.goals_by_id`. The
        expected evaluation result is also required, which should be one
        of the strings used for goal statuses (see
        `potluck.rubrics.Goal`).

        Note that the precise goal_spec list an `Expectation` should have
        depends on the metric used and the details of how a
        `potluck.rubrics.Rubric` object formulates its overall report,
        because any top-level organizational report rows (e.g. for goal
        types or categories) need to be accounted for. Specifying an
        identifier fragment doesn't depend on the metric, but requires
        understanding how identifiers are built up, and in some cases,
        automatic deduplication of goal identifiers must be accounted
        for.

        For matching using a goal spec that's a list of strings, the
        case-folded version of each goal_spec entry is checked using 'in'
        against a case-folded version of each rubric entry at the
        relevant level. Exactly 1 rubric entry must match. The rubric
        entries also have HTML tags stripped out, and have '^^^' added at
        the front and '$$$' at the end to aid in disambiguation.

        For example, if there are rubric entries named "Bug #1" and
        "Bug #11", an expectation for the "Bug #1" rubric entry could use
        "bug #1$" as its goal_spec entry.
        """
        self.goal_spec = goal_spec
        self.expected_status = expected_status

    def check(self, report):
        """
        Checks whether this expectation is fulfilled in a given report.
        Returns a tuple containing:
          1. Either True or False indicating success or failure.
          2. A string description of why the check failed (or how it
             succeeded).
          3. A list of strings containing the full unmodified
             initial descriptions of each report table row on the path
             to the row that was checked. If the check failed because
             it could not find the row it was looking for (including
             when the goal path is empty), this will be None.

        Raises a `ValueError` if the report has neither table rows nor
        files.
        """
        rows_here = report["table"]
        if rows_here == [] and report["files"] == []:
            raise ValueError("Report indicates no file to evaluate.")

        if isinstance(self.goal_spec, str):
            found, trail, problem = self._find_by_id(report)
        else:  # we assume it's a collection of strings
            found, trail, problem = self._find_by_path(rows_here)

        if found is None:
            return (False, problem, None)

        # String for reporting where we are
        where = "In " + ' → '.join(trail)

        # "found" is the matched goal's report row
        if found["status"] == self.expected_status:
            return (
                True,
                f"{where}, confirmed status '{self.expected_status}'.",
                trail
            )
        else:
            return (
                False,
                (
                    f"{where}, status '{found['status']}' did not match"
                    f" expected status '{self.expected_status}'."
                ),
                trail
            )

    def _find_by_id(self, report):
        """
        Finds the single row whose 'id' contains our identifier
        fragment. Returns (row, trail, None) on success, or
        (None, None, message) when zero or several rows match.
        """
        candidates = []
        all_ids = []
        for (row, trail) in all_row_trails(report):
            if 'id' in row:
                all_ids.append(row['id'])
                # Anchors let a fragment pin the start/end of the id
                if self.goal_spec in ('^^^' + row['id'] + '$$$'):
                    candidates.append((row, trail))

        if (
            len(all_ids) == 0
        and (
                report["summary"]
             == "You did not submit any code for this task."
            )
        ):
            return (None, None, "There was no submission.")

        if len(candidates) == 0:
            options = '\n'.join('#' + ident for ident in all_ids)
            return (
                None,
                None,
                (
                    "0 goals matched"
                    f" '#{self.goal_spec}'. Available goal ids"
                    f" are:\n{options}"
                )
            )
        elif len(candidates) > 1:
            options = '\n'.join(
                '#' + row['id']
                for (row, trail) in candidates
            )
            return (
                None,
                None,
                (
                    f"{len(candidates)} goals matched"
                    f" '#{self.goal_spec}'. Matching goals"
                    f" are:\n{options}"
                )
            )

        # We found one match:
        found, trail = candidates[0]
        return (found, trail, None)

    def _find_by_path(self, rows_here):
        """
        Walks down the report table matching one goal_spec entry per
        level, starting from the given top-level rows. Returns
        (row, trail, None) on success, or (None, None, message) when a
        level has zero or several matches or the path is empty.
        """
        found = None
        trail = []
        for match_key in self.goal_spec:
            # Match against descriptions at this level
            look_for = match_key.casefold()
            matches_here = [
                row
                for row in rows_here
                if look_for in simplify(row["description"][0])
            ]

            if len(matches_here) != 1:  # zero or multiple matches
                if trail:
                    where = "In " + ' → '.join(trail)
                else:
                    where = "At the top level of the report"

                options = '\n'.join(
                    row['description'][0]
                    for row in rows_here
                )
                return (
                    None,
                    None,
                    (
                        f"{where}, {len(matches_here)} goals matched"
                        f" '{match_key}'. Goals here are:\n{options}"
                    )
                )

            # A single match, as required: record it, extend our trail,
            # and enter the next level of the table. Leaf rows may lack
            # a "subtable" key, in which case there's nothing below.
            found = matches_here[0]
            trail.append(found["description"][0])
            rows_here = found.get("subtable", [])

        if found is None:
            # An empty goal path never selects a row; report that
            # instead of crashing when the status is looked up.
            return (
                None,
                None,
                "The goal path is empty, so no goal could be matched."
            )

        return (found, trail, None)


def check_entire_report(
    report,
    all_expectations,
    default_level=0,
    require_default="accomplished"
):
    """
    Given a report and a list of `Expectation` and/or `ExpectedWarning`
    objects, this function checks each of the expectations within the
    provided report, returning a tuple containing True or False to
    indicate success or failure, as well as a multi-line string
    explaining which checks failed or that all checks succeeded.

    If require_default is provided, then all rubric rows in the report
    which don't have an explicit `Expectation` provided for them or a
    sub-row at the given default_level must match the require_default
    status. Set require_default to None (the default is 'accomplished')
    to leave non-explicitly-checked rows unchecked.

    Raises a `ValueError` if an `Expectation` cannot locate its goal in
    the report, and a `TypeError` if an entry of all_expectations is
    neither an `Expectation` nor an `ExpectedWarning`.
    """
    explanation = "Some checks failed:\n"
    # Nested dictionaries keyed by row descriptions; a None key set to
    # True marks a path that an explicit Expectation has checked.
    coverage = {}
    succeeded = True
    unexpected_warnings = report["warnings"]
    for exp in all_expectations:
        if isinstance(exp, ExpectedWarning):
            # Filter out warnings
            before = len(unexpected_warnings)
            unexpected_warnings = exp.unexpected(unexpected_warnings)
            if len(unexpected_warnings) == before:  # nothing was filtered
                succeeded = False
                explanation += (
                    " Expected at least one warning containing the text"
                    f" '{exp.fragment}', but no such warning was present."
                    "\n (or it was filtered by a different warning"
                    " expectation.)\n"
                )

        elif isinstance(exp, Expectation):
            # Test goal status
            success, expl, path = exp.check(report)
            if path is None:
                if isinstance(exp.goal_spec, str):
                    gs = '#' + exp.goal_spec
                else:
                    gs = ' → '.join(exp.goal_spec)
                raise ValueError(
                    "Unable to find expected goal:\n{}\n{}".format(gs, expl)
                )
            c = coverage
            for entry in path:
                c = c.setdefault(entry, {})
            c[None] = True
            if not success:
                explanation += expl + '\n'
                succeeded = False

        else:
            raise TypeError(f"Invalid expectation type: {type(exp)}")

    default_count = 0
    if require_default is not None:
        def check_default_statuses(rows, covered, path):
            """
            Checks that the status of every row at a certain default
            level within the report hierarchy is equal to the required
            default status. Needs a list of rows at this level of the
            table, a dictionary of covered paths pertaining to this
            level of the table, and a list of strings indicating the
            path taken to get to this part of the table.

            Returns a tuple starting with True or False for success or
            failure, followed by a string describing the failure(s) or
            explaining the success.
            """
            nonlocal default_count
            passed = True
            explanation = ""
            level = len(path)
            if level == default_level:  # Check each non-covered row
                for row in rows:
                    desc = row["description"][0]
                    if covered.get(desc, {}).get(None, False):
                        continue  # don't check this covered row
                    default_count += 1
                    if row["status"] != require_default:
                        where = "In " + " → ".join(path + [desc])
                        explanation += (
                            f"{where} status '{row['status']}' did"
                            f" not match required default status"
                            f" '{require_default}'.\n"
                        )
                        passed = False
            else:  # Recurse
                for row in rows:
                    desc = row["description"][0]
                    # Leaf rows may lack a "subtable" key entirely
                    sub_success, sub_expl = check_default_statuses(
                        row.get("subtable", []),
                        covered.get(desc, {}),
                        path + [desc]
                    )
                    if not sub_success:
                        passed = False
                        explanation += sub_expl

            if passed:
                explanation = (
                    "All non-expected statuses were"
                    f" '{require_default}'."
                )
            return passed, explanation

        default_success, default_expl = check_default_statuses(
            report["table"],
            coverage,
            path=[]
        )
        if not default_success:
            succeeded = False
            explanation += default_expl

    if succeeded:
        explanation = "All {} expectation(s){} were met.".format(
            len(all_expectations),
            (
                f" (plus {default_count} default expectation(s))"
                if default_count > 0
                else ""
            )
        )

    # Check for warnings and replace/augment explanation
    if len(unexpected_warnings) > 0:
        wmsg = (
            "The report included unexpected warnings:\n "
          + "\n ".join(unexpected_warnings)
        )
        if succeeded:
            explanation = wmsg
        else:
            explanation = wmsg + '\n' + explanation

        succeeded = False

    return (succeeded, explanation)


def example(username, extra_modes=()):
    """
    Registers a current username such that calls to `expect` and/or
    `expect_validation` create expectations for that example submission,
    and creates an "evaluation" entry in the expectations table for it so
    that even if no expectations are established it will still be tested
    using default expectations.

    If extra_modes is provided, it should be a list of strings naming
    extra modes to check (e.g., ["validation"]). Note that as soon as an
    expectation is established for any mode that mode will be checked
    even if it wasn't specified here.
    """
    global CURRENT_EXAMPLE
    CURRENT_EXAMPLE = username
    mname = file_utils.get_spec_module_name()
    for mode in ["evaluation"] + list(extra_modes):
        EXPECTATIONS\
            .setdefault(mname, {})\
            .setdefault(mode, {})\
            .setdefault(username, [])


def expect(status, *id_or_path, mode="evaluation"):
    """
    Creates an `Expectation` object and registers it under the current
    example username (see `example`) for the given mode, which is
    "evaluation" unless specified otherwise.

    Arguments are:

    - status: The expected status. See `potluck.rubrics.Goal`.
    - id_or_path: One or more additional strings specifying which goal
        we're targeting (see `Expectation`). May also be a single string
        that starts with '#' to specify the goal using its identifier
        instead of a rubric-description-path; the '#' is stripped before
        matching. When giving a rubric-description-path, it should
        start with the goal type and then category when using the
        default rubric metric.
    - mode: Keyword-only; sets which mode of testing the expectation
        applies to. Valid modes are "evaluation" (the default) and
        "validation".
    """
    mname = file_utils.get_spec_module_name()

    if len(id_or_path) == 1 and id_or_path[0].startswith('#'):
        goal_spec = id_or_path[0][1:]
    else:
        goal_spec = id_or_path

    EXPECTATIONS\
        .setdefault(mname, {})\
        .setdefault(mode, {})\
        .setdefault(CURRENT_EXAMPLE, [])\
        .append(
            Expectation(
                goal_spec,
                status
            )
        )


def expect_validation(*args):
    """
    Establishes an expectation for the validation step. This is just a
    shortcut for calling `expect` with mode set to "validation".
    """
    expect(*args, mode="validation")


def expect_warnings(fragment='', mode="evaluation"):
    """
    Creates an `ExpectedWarning` object and registers it under the
    current example username. Registers for evaluation by default, but
    you can specify a different mode (e.g., "validation").

    The `fragment` argument if omitted will cause all warnings to be
    treated as expected, but if provided, only warnings whose raw HTML
    message string contains that fragment as a substring will be treated
    as expected.
    """
    mname = file_utils.get_spec_module_name()

    EXPECTATIONS\
        .setdefault(mname, {})\
        .setdefault(mode, {})\
        .setdefault(CURRENT_EXAMPLE, [])\
        .append(ExpectedWarning(fragment))


def get_expectations(spec_module, mode="evaluation"):
    """
    Returns all expectations for the given specification module and mode,
    as a dictionary mapping user IDs to expectation lists. Returns None
    if there are no expectations for the target mode or for the target
    module, and an empty expectation set hasn't been set up either.
    """
    return EXPECTATIONS.get(spec_module.__name__, {}).get(mode, None)
Global storage for expectations by spec module name, mode, and username. Entries are module names with mode dictionaries as values, whose keys are modes (i.e., "evaluation" or "validation") and whose values are dictionaries that map usernames to lists of expectations.
Which username are expectations automatically registered for?
def simplify(description):
    """
    Produces the canonical form of a goal description used when matching
    expectations: anything that looks like an HTML tag is dropped, the
    remaining text is case-folded, and the result is wrapped in the
    anchors '^^^' (front) and '$$$' (back) so that match rules may use
    all or part of an anchor to pin the start or end of a description.
    Angle brackets that are not part of HTML tags are assumed to have
    been escaped already.
    """
    without_tags = re.sub(r"<[^>]*>", '', description)
    return f"^^^{without_tags.casefold()}$$$"
Normalizes case and removes HTML tags from the given goal description, for use in expectation matching. Note that angle brackets which aren't used for HTML tags are assumed to already be escaped. Adds '^^^' at the start and '$$$' at the end so that rules can use those anchors (or part of them) for disambiguation.
def all_row_trails(report_or_row, trail_prefix=None):
    """
    Generator that walks a report (anything with a "table" entry) or a
    single report row, parents before children. Each visited row is
    yielded as a `(row, trail)` pair, where the trail lists the first
    description entry of every ancestor from the top level downwards,
    ending with the row's own.

    When a trail prefix (a list of strings) is supplied, every trail
    starts with those entries.
    """
    prefix = [] if trail_prefix is None else trail_prefix

    if "table" in report_or_row:
        # A report has no description of its own: just descend into
        # each of its top-level rows.
        for top_row in report_or_row["table"]:
            yield from all_row_trails(top_row, prefix)
        return

    here = prefix + [report_or_row["description"][0]]
    yield (report_or_row, here)

    if "subtable" in report_or_row:
        for child in report_or_row["subtable"]:
            yield from all_row_trails(child, here)
Visits each row & sub-row of a report (or a report row) one by one. For each row visited, it yields a tuple containing that row, followed by a trail: a list of the first description entry of each ancestor of that row, starting from the top-level ancestor and going down and including that row.
If provided, the given trail prefix (a list of strings) will be included before the start of each trail.
class ExpectedWarning:
    """
    Declares ahead of time that warnings containing a particular piece
    of text are anticipated, so that their presence does not cause a
    check to fail.
    """
    def __init__(self, message_fragment=''):
        """
        Stores the fragment that identifies anticipated warnings. Every
        warning whose raw HTML code string contains the fragment is
        treated as expected; the rest are not. With the default empty
        fragment every warning counts as expected, so a fragment should
        normally be given.
        """
        self.fragment = message_fragment

    def unexpected(self, warnings):
        """
        Filters the given list of warnings, returning a new list holding
        only those that this expectation does *not* account for (i.e.,
        those that don't contain the fragment).
        """
        leftover = []
        for message in warnings:
            if self.fragment in message:
                continue  # anticipated, so drop it
            leftover.append(message)
        return leftover
An expected warning provides a heads-up that a warning containing certain text is expected, so that getting such a warning won't fail a check.
def __init__(self, message_fragment=''):
    """
    Records the text fragment that marks a warning as anticipated.
    Warnings whose raw HTML code string contains the fragment will be
    ignored, and all others will not. Because the default fragment is
    the empty string, omitting it causes every warning to be ignored,
    so a fragment should normally be supplied.
    """
    self.fragment = message_fragment
A message fragment should be provided, or by default all warnings will be ignored. If a fragment is provided, all warnings which include that fragment as part of their raw HTML code string will be ignored, but other warnings will not be.
def unexpected(self, warnings):
    """
    Given a list of warnings, returns a new list containing only the
    ones that *aren't* accounted for by this expectation, i.e. those
    that do not contain this object's fragment.
    """
    return list(
        filter(lambda message: self.fragment not in message, warnings)
    )
Returns all of the warnings from the given list which aren't expected, given this expectation that certain warning(s) might be present.
class Expectation:
    """
    An expectation establishes that a specific goal should evaluate to a
    specific result within a report. These expectations can be tested
    to make sure that a specification is working as designed.

    To specify which goal the expectation applies to, there are two
    options: you can provide a fragment of the goal's identifier as a
    string (it must match exactly one goal; see
    `potluck.rubrics.Rubric.goals_by_id`), or you can provide a list of
    strings, each of which must uniquely match against an item in a
    report table at a specific level, with the next string matching
    against that row's sub-table, and so on. These matches are performed
    in a case-insensitive manner with HTML tags stripped out, against
    the primary obfuscated description entry for each goal/category. The
    specified string only has to match part of the goal description, but
    it must not match multiple goal descriptions at a given table level.
    The characters '^^^' are added to the beginning of the rubric string,
    and '$$$' to the end, to aid in disambiguation.

    Because of these matching rules, for a rubric where the standard
    metric `potluck.rubrics.core_extras_categorized_metric` is used,
    goal paths are usually straightforward to construct when default
    descriptions are in place. Some examples of both id-fragment and
    goal-path methods:

    - For a core FunctionDef Check for function 'foo':
        `"core.check:def-foo$"` OR
        `[ "procedure", "core", "define foo" ]`

    - For an extra FunctionCall Check for 'bar' as a sub-rule of the
      check above:
        `"core.check:def-foo:call-bar$"` OR
        `[ "procedure", "extra", "define foo", "call bar" ]`

    - For a core trace test of function 'foo', assuming it was created
      with group_name "trace":
        `"core.test:foo:trace"` OR
        `[ "process", "core", "the foo function must" ]`
      - (note that one could also use:)
        `"^goal:core.test:foo:trace$"` OR
        `[ "process", "core", "^the foo function must" ]`

    - For a core result value test of function 'foo' (with no group_name):
        `"core.test:foo$"` OR
        `[ "product", "core", "foo returns" ]`
        (Note for the ID version, the $ is important to distinguish from
        the case above.)

    - For a core printed output test of function 'foo' (with group_name
      "output"):
        `"core.test:foo:output"` OR
        `[ "behavior", "core", "foo prints" ]`
    """
    def __init__(self, goal_spec, expected_status):
        """
        The goal_spec is a list of strings specifying how to find the
        goal in a report (strings are matched against descriptions to
        find sub-tables). Alternatively, the goal_spec may be a single
        string, which must match a single goal in the rubric using
        the same rules as `potluck.rubrics.Rubric.goals_by_id`. The
        expected evaluation result is also required, which should be one
        of the strings used for goal statuses (see
        `potluck.rubrics.Goal`).

        Note that the precise goal_spec list an `Expectation` should have
        depends on the metric used and the details of how a
        `potluck.rubrics.Rubric` object formulates its overall report,
        because any top-level organizational report rows (e.g. for goal
        types or categories) need to be accounted for. Specifying an
        identifier fragment doesn't depend on the metric, but requires
        understanding how identifiers are built up, and in some cases,
        automatic deduplication of goal identifiers must be accounted
        for.

        For matching using a goal spec that's a list of strings, the
        case-folded version of each goal_spec entry is checked using 'in'
        against a case-folded version of each rubric entry at the
        relevant level. Exactly 1 rubric entry must match. The rubric
        entries also have HTML tags stripped out, and have '^^^' added at
        the front and '$$$' at the end to aid in disambiguation.

        For example, if there are rubric entries named "Bug #1" and
        "Bug #11", an expectation for the "Bug #1" rubric entry could use
        "bug #1$" as its goal_spec entry.
        """
        self.goal_spec = goal_spec
        self.expected_status = expected_status

    def check(self, report):
        """
        Checks whether this expectation is fulfilled in a given report.
        Returns a tuple containing:
          1. Either True or False indicating success or failure.
          2. A string description of why the check failed (or how it
             succeeded).
          3. A list of strings containing the full unmodified
             initial descriptions of each report table row on the path
             to the row that was checked. If the check failed because
             it could not find the row it was looking for (including
             when the goal path is empty), this will be None.

        Raises a `ValueError` if the report has neither table rows nor
        files.
        """
        rows_here = report["table"]
        if rows_here == [] and report["files"] == []:
            raise ValueError("Report indicates no file to evaluate.")

        if isinstance(self.goal_spec, str):
            found, trail, problem = self._find_by_id(report)
        else:  # we assume it's a collection of strings
            found, trail, problem = self._find_by_path(rows_here)

        if found is None:
            return (False, problem, None)

        # String for reporting where we are
        where = "In " + ' → '.join(trail)

        # "found" is the matched goal's report row
        if found["status"] == self.expected_status:
            return (
                True,
                f"{where}, confirmed status '{self.expected_status}'.",
                trail
            )
        else:
            return (
                False,
                (
                    f"{where}, status '{found['status']}' did not match"
                    f" expected status '{self.expected_status}'."
                ),
                trail
            )

    def _find_by_id(self, report):
        """
        Finds the single row whose 'id' contains our identifier
        fragment. Returns (row, trail, None) on success, or
        (None, None, message) when zero or several rows match.
        """
        candidates = []
        all_ids = []
        for (row, trail) in all_row_trails(report):
            if 'id' in row:
                all_ids.append(row['id'])
                # Anchors let a fragment pin the start/end of the id
                if self.goal_spec in ('^^^' + row['id'] + '$$$'):
                    candidates.append((row, trail))

        if (
            len(all_ids) == 0
        and (
                report["summary"]
             == "You did not submit any code for this task."
            )
        ):
            return (None, None, "There was no submission.")

        if len(candidates) == 0:
            options = '\n'.join('#' + ident for ident in all_ids)
            return (
                None,
                None,
                (
                    "0 goals matched"
                    f" '#{self.goal_spec}'. Available goal ids"
                    f" are:\n{options}"
                )
            )
        elif len(candidates) > 1:
            options = '\n'.join(
                '#' + row['id']
                for (row, trail) in candidates
            )
            return (
                None,
                None,
                (
                    f"{len(candidates)} goals matched"
                    f" '#{self.goal_spec}'. Matching goals"
                    f" are:\n{options}"
                )
            )

        # We found one match:
        found, trail = candidates[0]
        return (found, trail, None)

    def _find_by_path(self, rows_here):
        """
        Walks down the report table matching one goal_spec entry per
        level, starting from the given top-level rows. Returns
        (row, trail, None) on success, or (None, None, message) when a
        level has zero or several matches or the path is empty.
        """
        found = None
        trail = []
        for match_key in self.goal_spec:
            # Match against descriptions at this level
            look_for = match_key.casefold()
            matches_here = [
                row
                for row in rows_here
                if look_for in simplify(row["description"][0])
            ]

            if len(matches_here) != 1:  # zero or multiple matches
                if trail:
                    where = "In " + ' → '.join(trail)
                else:
                    where = "At the top level of the report"

                options = '\n'.join(
                    row['description'][0]
                    for row in rows_here
                )
                return (
                    None,
                    None,
                    (
                        f"{where}, {len(matches_here)} goals matched"
                        f" '{match_key}'. Goals here are:\n{options}"
                    )
                )

            # A single match, as required: record it, extend our trail,
            # and enter the next level of the table. Leaf rows may lack
            # a "subtable" key, in which case there's nothing below.
            found = matches_here[0]
            trail.append(found["description"][0])
            rows_here = found.get("subtable", [])

        if found is None:
            # An empty goal path never selects a row; report that
            # instead of crashing when the status is looked up.
            return (
                None,
                None,
                "The goal path is empty, so no goal could be matched."
            )

        return (found, trail, None)
An expectation establishes that a specific goal should evaluate to a specific result within a report. These expectations can be tested to make sure that a specification is working as designed.
To specify which goal the expectation applies to, there are two
options: you can provide a fragment of the goal's identifier as a
string (it must match exactly one goal; see
potluck.rubrics.Rubric.goals_by_id
), or you can provide a list of
strings, each of which must uniquely match against an item in a report table
at a specific level, with the next string matching against that row's
sub-table, and so on. These matches are performed in a
case-insensitive manner with HTML tags stripped out, against the
primary obfuscated description entry for each goal/category. The
specified string only has to match part of the goal description, but
it must not match multiple goal descriptions at a given table level.
The characters '^^^' are added to the beginning of the rubric string,
and '$$$' to the end, to aid in disambiguation.
Because of these matching rules, for a rubric where the standard
metric potluck.rubrics.core_extras_categorized_metric
is used,
goal paths are usually straightforward to construct when default
descriptions are in place. Some examples of both id-fragment and
goal-path methods:
For a core FunctionDef Check for function 'foo':
"core.check:def-foo$"
OR [ "procedure", "core", "define foo" ]
For an extra FunctionCall Check for 'bar' as a sub-rule of the check above:
"core.check:def-foo:call-bar$"
OR [ "procedure", "extra", "define foo", "call bar" ]
For a core trace test of function 'foo', assuming it was created with group_name "trace":
"core.test:foo:trace"
OR [ "process", "core", "the foo function must" ]
(note that one could also use:)
"^goal:core.test:foo:trace$"
OR [ "process", "core", "^the foo function must" ]
For a core result value test of function 'foo' (with no group_name):
"core.test:foo$"
OR [ "product", "core", "foo returns" ]
(Note that for the ID version, the $ is important to distinguish this goal from the one in the case above.)
For a core printed output test of function 'foo' (with group_name "output"):
"core.test:foo:output"
OR [ "behavior", "core", "foo prints" ]
def __init__(self, goal_spec, expected_status):
    """
    Sets up an expectation that one particular goal ends up with one
    particular status.

    The goal_spec says which goal is meant, in one of two forms:

    - A list of strings, used as a path through the report: each
        string is matched against the descriptions at one level of
        the report table, and the next string is matched within the
        matching row's sub-table.
    - A single string, which must match a single goal in the rubric
        using the same rules as `potluck.rubrics.Rubric.goals_by_id`.

    The expected_status is the required evaluation result, and should
    be one of the strings used for goal statuses (see
    `potluck.rubrics.Goal`).

    Note that the precise goal_spec list an `Expectation` should have
    depends on the metric used and the details of how a
    `potluck.rubrics.Rubric` object formulates its overall report,
    because any top-level organizational report rows (e.g. for goal
    types or categories) need to be accounted for. An identifier
    fragment doesn't depend on the metric, but requires understanding
    how identifiers are built up, and in some cases automatic
    deduplication of goal identifiers must be accounted for.

    When the goal spec is a list of strings, the case-folded version
    of each entry is checked using 'in' against a case-folded version
    of each rubric entry at the relevant level, and exactly 1 rubric
    entry must match. The rubric entries also have HTML tags stripped
    out, and have '^^^' added at the front and '$$$' at the end to
    aid in disambiguation.

    For example, if there are rubric entries named "Bug #1" and
    "Bug #11", an expectation for the "Bug #1" rubric entry could use
    "bug #1$" as its goal_spec entry.
    """
    # Nothing is validated or resolved here; `check` interprets both
    # values when it is given a report.
    self.expected_status = expected_status
    self.goal_spec = goal_spec
The goal_spec is a list of strings specifying how to find the
goal in a report (strings are matched against descriptions to
find sub-tables). Alternatively, the goal_spec may be a single
string, which must match a single goal in the rubric using
the same rules as potluck.rubrics.Rubric.goals_by_id
. The
expected evaluation result is also required, which should be one
of the strings used for goal statuses (see
potluck.rubrics.Goal
).
Note that the precise goal_spec list an Expectation
should have
depends on the metric used and the details of how a
potluck.rubrics.Rubric
object formulates its overall report,
because any top-level organizational report rows (e.g. for goal
types or categories) need to be accounted for. Specifying an
identifier fragment doesn't depend on the metric, but requires
understanding how identifiers are built up, and in some cases,
automatic deduplication of goal identifiers must be accounted
for.
For matching using a goal spec that's a list of strings, the case-folded version of each goal_spec entry is checked using 'in' against a case-folded version of each rubric entry at the relevant level. Exactly 1 rubric entry must match. The rubric entries also have HTML tags stripped out, and have '^^^' added at the front and '$$$' at the end to aid in disambiguation.
For example, if there are rubric entries named "Bug #1" and "Bug #11", an expectation for the "Bug #1" rubric entry could use "bug #1$" as its goal_spec entry.
def check(self, report):
    """
    Checks whether this expectation is fulfilled in a given report.

    The report must be a dictionary with "table" and "files" entries;
    its "summary" entry is also consulted when the goal spec is an
    identifier fragment and no row in the report has an id.

    Returns a tuple containing:

    1. Either True or False indicating success or failure.
    2. A string description of why the check failed (or how it
        succeeded).
    3. A list of strings containing the full unmodified initial
        descriptions of each report table row on the path to the row
        that was checked. If the check failed because it could not
        find the row it was looking for (including when the goal spec
        is an empty sequence), this will be None.

    Raises a `ValueError` if both the report's table and its list of
    files are empty.
    """
    rows_here = report["table"]
    if rows_here == [] and report["files"] == []:
        raise ValueError("Report indicates no file to evaluate.")

    if isinstance(self.goal_spec, str):
        # Identifier-fragment matching: look at every row that has
        # an id, at any depth of the report.
        candidates = []
        all_ids = []
        for (row, row_trail) in all_row_trails(report):
            if 'id' in row:
                all_ids.append(row['id'])
                # Anchors let the fragment pin the start and/or end
                # of the identifier.
                if self.goal_spec in ('^^^' + row['id'] + '$$$'):
                    candidates.append((row, row_trail))

        if (
            not all_ids
        and (
                report["summary"]
             == "You did not submit any code for this task."
            )
        ):
            return (
                False,
                "There was no submission.",
                None
            )

        if not candidates:
            options = '\n'.join(
                '#' + ident
                for ident in all_ids
            )
            return (
                False,
                (
                    f"0 goals matched"
                    f" '#{self.goal_spec}'. Available goal ids"
                    f" are:\n{options}"
                ),
                None
            )
        elif len(candidates) > 1:
            options = '\n'.join(
                '#' + row['id']
                for (row, _) in candidates
            )
            return (
                False,
                (
                    f"{len(candidates)} goals matched"
                    f" '#{self.goal_spec}'. Matching goals"
                    f" are:\n{options}"
                ),
                None
            )

        # We found one match:
        found, trail = candidates[0]

    else:  # we assume it's a collection of strings
        found = None
        trail = []

        # Match at each level of our goal path
        for match_key in self.goal_spec:
            look_for = match_key.casefold()

            # Match against descriptions at this level
            matches_here = [
                row
                for row in rows_here
                if look_for in simplify(row["description"][0])
            ]

            # Check # of matching rows
            if len(matches_here) != 1:  # zero or multiple matches
                if trail:
                    where = "In " + ' → '.join(trail)
                else:
                    where = "At the top level of the report"

                options = '\n'.join(
                    row['description'][0]
                    for row in rows_here
                )
                return (
                    False,
                    (
                        f"{where}, {len(matches_here)} goals matched"
                        f" '{match_key}'. Goals here are:\n{options}"
                    ),
                    None
                )

            # A single match, as required: record the goal or other
            # table row we found and extend our trail.
            found = matches_here[0]
            trail.append(found["description"][0])
            # Enter next level of the table. Rows without a
            # "subtable" entry are treated as having no sub-rows
            # (as `all_row_trails` does) so that matching a leaf row
            # cannot raise a KeyError.
            rows_here = found.get("subtable", [])

        if found is None:
            # The goal spec had no entries, so no row was selected;
            # report this as "row not found" instead of crashing
            # when the status is looked up below.
            return (
                False,
                "The goal spec is empty, so it cannot identify a goal.",
                None
            )

    # String for reporting our result
    where = "In " + ' → '.join(trail)

    # "found" is now the matched goal's report row
    if found["status"] == self.expected_status:
        return (
            True,
            f"{where}, confirmed status '{self.expected_status}'.",
            trail
        )
    else:
        return (
            False,
            (
                f"{where}, status '{found['status']}' did not match"
                f" expected status '{self.expected_status}'."
            ),
            trail
        )
Checks whether this expectation is fulfilled in a given report. Returns a tuple containing: 1. Either True or False indicating success or failure. 2. A string description of why the check failed (or how it succeeded). 3. A list of strings containing the full unmodified initial descriptions of each report table row on the path to the row that was checked. If the check failed because it could not find the row it was looking for, this will be None.
def check_entire_report(
    report,
    all_expectations,
    default_level=0,
    require_default="accomplished"
):
    """
    Given a report and a list of `Expectation` and/or `ExpectedWarning`
    objects, this function checks each of the expectations within the
    provided report, returning a tuple containing True or False to
    indicate success or failure, as well as a multi-line string
    explaining which checks failed or that all checks succeeded.

    If require_default is provided, then every row at depth
    default_level of the report table (0 being the top level) which
    was not itself the target of an explicit `Expectation` must have
    the require_default status. Set require_default to None (the
    default is 'accomplished') to leave non-explicitly-checked rows
    unchecked.

    Each `ExpectedWarning` must filter out at least one of the
    report's warnings that is still unfiltered when it is processed,
    and any warnings left over after all of them have been applied
    cause the overall check to fail.

    Raises a `ValueError` if an `Expectation` cannot find the goal it
    targets, and a `TypeError` if all_expectations contains anything
    other than `Expectation` or `ExpectedWarning` objects.
    """
    explanation = "Some checks failed:\n"
    # Nested dictionaries keyed by row description; the None key marks
    # a row that an explicit expectation targeted.
    coverage = {}
    succeeded = True
    unexpected_warnings = report["warnings"]
    for exp in all_expectations:
        if isinstance(exp, ExpectedWarning):
            # Filter out warnings
            before = len(unexpected_warnings)
            unexpected_warnings = exp.unexpected(unexpected_warnings)
            if len(unexpected_warnings) == before:  # nothing was filtered
                succeeded = False
                explanation += (
                    f" Expected at least one warning containing the text"
                    f" '{exp.fragment}', but no such warning was present."
                    f"\n (or it was filtered by a different warning"
                    f" expectation.)\n"
                )

        elif isinstance(exp, Expectation):
            # Test goal status
            success, expl, path = exp.check(report)
            if path is None:
                if isinstance(exp.goal_spec, str):
                    gs = '#' + exp.goal_spec
                else:
                    gs = ' → '.join(exp.goal_spec)
                raise ValueError(
                    "Unable to find expected goal:\n{}\n{}".format(gs, expl)
                )
            # Record the checked row as covered
            c = coverage
            for entry in path:
                c = c.setdefault(entry, {})
            c[None] = True
            if not success:
                explanation += expl + '\n'
                succeeded = False

        else:
            raise TypeError(f"Invalid expectation type: {type(exp)}")

    default_count = 0
    if require_default is not None:
        def check_default_statuses(rows, covered, path):
            """
            Checks that the status of every row at a certain default
            level within the report hierarchy is equal to the required
            default status. Needs a list of rows at this level of the
            table, a dictionary of covered paths pertaining to this
            level of the table, and a list of strings indicating the
            path taken to get to this part of the table.

            Returns a tuple starting with True or False for success or
            failure, followed by a string describing the failure(s) or
            explaining the success.
            """
            # Only default_count is rebound here; default_level and
            # require_default are just read from the enclosing scope.
            nonlocal default_count
            passed = True
            problems = ""
            if len(path) == default_level:  # Check each non-covered row
                for row in rows:
                    desc = row["description"][0]
                    if desc in covered and covered[desc].get(None, False):
                        continue  # don't check this covered row
                    default_count += 1
                    if row["status"] != require_default:
                        where = "In " + " → ".join(path + [desc])
                        problems += (
                            f"{where} status '{row['status']}' did"
                            f" not match required default status"
                            f" '{require_default}'.\n"
                        )
                        passed = False
            else:  # Recurse
                for row in rows:
                    desc = row["description"][0]
                    # Rows without a "subtable" entry are treated as
                    # having no sub-rows (as `all_row_trails` does)
                    # rather than raising a KeyError.
                    sub_success, sub_expl = check_default_statuses(
                        row.get("subtable", []),
                        covered.get(desc, {}),
                        path + [desc]
                    )
                    if not sub_success:
                        passed = False
                        problems += sub_expl

            if passed:
                problems = (
                    f"All non-expected statuses were"
                    f" '{require_default}'."
                )
            return passed, problems

        default_success, default_expl = check_default_statuses(
            report["table"],
            coverage,
            path=[]
        )
        if not default_success:
            succeeded = False
            explanation += default_expl

    if succeeded:
        explanation = "All {} expectation(s){} were met.".format(
            len(all_expectations),
            (
                f" (plus {default_count} default expectation(s))"
                if default_count > 0
                else ""
            )
        )

    # Check for warnings and replace/augment explanation
    if len(unexpected_warnings) > 0:
        wmsg = (
            "The report included unexpected warnings:\n "
          + "\n ".join(unexpected_warnings)
        )
        if succeeded:
            explanation = wmsg
        else:
            explanation = wmsg + '\n' + explanation

        succeeded = False

    return (succeeded, explanation)
Given a report and a list of Expectation
and/or ExpectedWarning
objects, this function checks each of the expectations within the
provided report, returning a tuple containing True or False to
indicate success or failure, as well as a multi-line string
explaining which checks failed or that all checks succeeded.
If require_default is provided, then all rubric rows in the report
which don't have an explicit Expectation
provided for them or a
sub-row at the given default_level must match the require_default
status. Set require_default to None (the default is 'accomplished')
to leave non-explicitly-checked rows unchecked.
def example(username, extra_modes=()):
    """
    Makes the given username the current example, so that later calls
    to `expect` and/or `expect_validation` create expectations for that
    example submission. Also makes sure the expectations table has an
    "evaluation" entry for that username, so that even if no
    expectations are established it will still be tested using default
    expectations.

    If extra_modes is provided, it should be a list of strings naming
    extra modes to check (e.g., ["validation"]); an (initially empty)
    entry is set up for the username under each of them as well. Note
    that as soon as an expectation is established for any mode, that
    mode will be checked even if it wasn't specified here.
    """
    global CURRENT_EXAMPLE
    CURRENT_EXAMPLE = username

    # All modes live under the entry for the current spec module.
    by_mode = EXPECTATIONS.setdefault(
        file_utils.get_spec_module_name(),
        {}
    )
    for mode in ("evaluation", *extra_modes):
        # setdefault leaves any already-registered expectations alone
        by_mode.setdefault(mode, {}).setdefault(username, [])
Registers a current username such that calls to expect
and/or
expect_validation
create expectations for that example submission,
and creates an "evaluation" entry in the expectations table for it so
that even if no expectations are established it will still be tested
using default expectations.
If extra_modes is provided, it should be a list of strings naming extra modes to check (e.g., ["validation"]). Note that as soon as an expectation is established for any mode that mode will be checked even if it wasn't specified here.
def expect(status, *id_or_path, mode="evaluation"):
    """
    Builds an `Expectation` and adds it to the expectations registered
    for the current example username (see `example`) under the given
    mode.

    Arguments are:

    - status: The expected status. See `potluck.rubrics.Goal`.
    - id_or_path: One or more additional strings specifying which goal
        we're targeting (see `Expectation`). If exactly one string is
        given and it starts with '#', the rest of that string is used
        as a goal identifier fragment; otherwise the strings are used
        together as a rubric-description-path.
    - mode: Keyword-only; sets which mode of testing the expectation
        applies to. Valid modes are "evaluation" (the default) and
        "validation".
    """
    mname = file_utils.get_spec_module_name()

    # A lone '#...' argument is an identifier fragment (stored without
    # the '#'); anything else is kept as the tuple of path entries.
    by_id = len(id_or_path) == 1 and id_or_path[0].startswith('#')
    goal_spec = id_or_path[0][1:] if by_id else id_or_path

    registered = (
        EXPECTATIONS
        .setdefault(mname, {})
        .setdefault(mode, {})
        .setdefault(CURRENT_EXAMPLE, [])
    )
    registered.append(Expectation(goal_spec, status))
Creates an Expectation
object and registers it under the current
example username as an evaluation expectation (or as an expectation for whichever mode is specified).
Arguments are:
- status: The expected status. See
potluck.rubrics.Goal
. - id_or_path: One or more additional strings specifying which goal we're
targeting (see
Expectation
). May also be a single string that starts with '#' to specify the goal using its identifier instead of a rubric-description-path. If it's a rubric-description-path, it should start with the goal type and then category when using the default rubric metric. - mode: Keyword-only; sets which mode of testing the expectation applies to. Valid modes are "evaluation" (the default) and "validation".
def expect_validation(*args):
    """
    Registers an expectation that applies to the validation step
    rather than to evaluation. Takes the same positional arguments as
    `expect` (the expected status followed by the goal id or path) and
    simply forwards them with mode set to "validation".
    """
    # Pure convenience wrapper: all of the work happens in `expect`.
    expect(*args, mode="validation")
Establishes an expectation for the validation step. This is just a
shortcut for calling expect
with mode set to "validation".
def expect_warnings(fragment='', mode="evaluation"):
    """
    Builds an `ExpectedWarning` and adds it to the expectations
    registered for the current example username. It is registered for
    evaluation by default, but a different mode (e.g., "validation")
    may be given.

    If `fragment` is omitted, all warnings are treated as expected;
    if it is provided, only warnings whose raw HTML message string
    contains that fragment as a substring are treated as expected.
    """
    mname = file_utils.get_spec_module_name()

    # Walk down module -> mode -> username, creating levels as needed.
    by_mode = EXPECTATIONS.setdefault(mname, {})
    by_user = by_mode.setdefault(mode, {})
    by_user.setdefault(CURRENT_EXAMPLE, []).append(
        ExpectedWarning(fragment)
    )
Creates an ExpectedWarning
object and registers it under the
current example username. Registers for evaluation by default, but
you can specify a different mode (e.g., "validation").
The fragment
argument if omitted will cause all warnings to be
treated as expected, but if provided, only warnings whose raw HTML
message string contains that fragment as a substring will be treated
as expected.
def get_expectations(spec_module, mode="evaluation"):
    """
    Looks up the expectations registered for the given specification
    module (by its `__name__`) and mode. The result is a dictionary
    mapping user IDs to expectation lists, or None if nothing (not
    even an empty expectation set) has been registered for that
    module or for that mode.
    """
    # An unknown module behaves like one with no modes registered.
    by_mode = EXPECTATIONS.get(spec_module.__name__, {})
    return by_mode.get(mode)
Returns all expectations for the given specification module and mode, as a dictionary mapping user IDs to expectation lists. Returns None if there are no expectations for the target mode or for the target module, and an empty expectation set hasn't been set up either.