Source code for mdvtools.tests.test_bad_data

import pandas as pd
from mdvtools.mdvproject import MDVProject
import os
import json


def test_bad_data():
    """Test what happens with NaN, Infinity...

    Previously, although there was code for filtering `na = na[~numpy.isnan(na)]`,
    this didn't help if there also happened to be Infinity. The `json.dumps`
    default behaviour of `allow_nan=True` would then cheerfully output
    non-compliant JSON which, contrary to what the Python documentation states,
    is not, as of this writing (2024-02-05), compatible with *any* standard
    JavaScript-based decoder I tested (let alone 'most').

    We now use `allow_nan=False`, so the user is alerted at project creation
    time rather than at runtime, but we also correct the error earlier in the
    process so that the particular case of `isinf` is handled.

    If there are no valid numbers at all in a numeric column, an exception is
    thrown; in this example we catch it and, as of now, end up with a
    datasource with 0 columns.
    """
    path = os.path.join(os.path.dirname(__file__), "temp", "test_bad_data")
    if not os.path.exists(path):
        os.makedirs(path)
    p = MDVProject(path, delete_existing=True)
    df_good = pd.DataFrame({"a": [42]})
    df_mixed = pd.DataFrame({"a": [42, float("nan"), float("inf")]})
    # used when explicitly setting a numeric column that we want to interpret
    # as 'text' (so it can be a category, e.g. clusterID)
    df_bad_text = pd.DataFrame({"a": [0, 1, float("nan")]})
    df_bad = pd.DataFrame({"a": [float("nan")]})
    try:
        p.add_datasource("good data", df_good)
        assert len(p.datasources[0]["columns"]) == 1
        print("good data added ok")

        p.add_datasource("mixed data", df_mixed)
        assert len(p.datasources[1]["columns"]) == 1
        print(
            "mixed data added ok, column metadata should not contain any NaN/Infinity etc:"
        )
        print(f"{json.dumps(p.datasources[1]['columns'][0])}")

        # providing 'columns' without a 'datatype' key will cause a KeyError,
        # and the column will be ignored (although the datasource will still
        # think it has a 'length' of 3 even though it has 0 columns)
        bad_columns = p.add_datasource(
            "bad column metadata", df_bad_text, columns=[{"name": "a", "type": "text"}]
        )
        assert len(bad_columns) == 1

        # this was coercing 'nan' to 0, making 'nan' and 0 indistinguishable,
        # which is bad
        p.add_datasource(
            "bad text data", df_bad_text, columns=[{"name": "a", "datatype": "text"}]
        )
        assert (
            len(p.datasources[3]["columns"][0]["values"]) == 3
        )  # expect ['0', '1', 'nan']

        df_really_not_a_number = pd.DataFrame({"a": [1, 2, float("nan"), "ERROR"]})
        bad_columns = p.add_datasource(
            "really bad data",
            df_really_not_a_number,
            columns=[{"name": "a", "datatype": "double"}],
        )

        # expect `ValueError('zero-size array to reduction operation minimum
        # which has no identity')`
        p.add_datasource("bad data", df_bad)
        # df_bad is sufficiently degenerate that we don't expect to get this far
        assert False
    except Exception as e:
        print(
            "There was an exception - was it expected? (yes) Does it help us "
            "understand how to fix the problem? What state is the project "
            "datasource metadata in after this?"
        )
        print(e)
        print(json.dumps(p.datasources, indent=2))
# p.serve(port=5055)
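

# --- Illustrative sketch, not part of the original test module: the filtering
# issue described in the docstring above. A `~numpy.isnan(na)` mask keeps
# Infinity, so a finiteness check is needed to drop both NaN and +/-Infinity
# before min/max column metadata is computed; `numpy.isfinite` is one way to
# express that. The helper name `_demo_finite_filtering` is hypothetical.
import numpy as np


def _demo_finite_filtering():
    na = np.array([42.0, float("nan"), float("inf")])
    print(na[~np.isnan(na)])    # [42. inf] - Infinity survives the isnan filter
    print(na[np.isfinite(na)])  # [42.] - both NaN and Infinity are removed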
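

# --- Illustrative sketch, not part of the original test module: the
# `json.dumps` behaviour the docstring refers to. With the default
# `allow_nan=True`, Python emits the non-standard tokens `NaN`/`Infinity`,
# which standard JavaScript `JSON.parse` rejects; `allow_nan=False` raises a
# `ValueError` at serialisation time instead, so the problem surfaces when the
# project is created rather than when the browser tries to decode it. The
# helper name `_demo_allow_nan` is hypothetical.
def _demo_allow_nan():
    print(json.dumps([float("nan"), float("inf")]))  # '[NaN, Infinity]' - not valid JSON
    try:
        json.dumps([float("inf")], allow_nan=False)
    except ValueError as e:
        print(e)  # "Out of range float values are not JSON compliant"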