import pandas as pd
from mdvtools.mdvproject import MDVProject
import os
import json
def test_bad_data():
    """
    Test what happens with NaN, Infinity...

    Previously, although there was code for filtering NaN (`na = na[~numpy.isnan(na)]`),
    this didn't help if the column also happened to contain Infinity.
    The `json.dumps` default behaviour of `allow_nan=True` would then cheerfully output
    non-compliant JSON which, contrary to what the Python documentation states, is not,
    as of this writing (2024-02-05), compatible with *any* standard JavaScript-based
    decoder I tested (let alone 'most').
    We now use `allow_nan=False`, so the user is alerted at project creation time rather
    than at runtime; we also correct the error earlier in the process so that the
    particular case of `isinf` is handled.
    If there are no valid numbers at all in a numeric column, an exception is thrown;
    in this example we catch it and, as of now, end up with a datasource with 0 columns.
    """
    path = os.path.join(os.path.dirname(__file__), "temp", "test_bad_data")
    if not os.path.exists(path):
        os.makedirs(path)
    p = MDVProject(path, delete_existing=True)
    df_good = pd.DataFrame({"a": [42]})
    df_mixed = pd.DataFrame({"a": [42, float("nan"), float("inf")]})
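    # expectation per the docstring: the single finite value (42) drives the column
    # min/max metadata, with NaN and Infinity both filtered out before serialisation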
    # used when explicitly marking a numeric column as 'text', so it can serve as a
    # category (e.g. a cluster ID)
    df_bad_text = pd.DataFrame({"a": [0, 1, float("nan")]})
    df_bad = pd.DataFrame({"a": [float("nan")]})
    try:
        p.add_datasource("good data", df_good)
        assert len(p.datasources[0]["columns"]) == 1
        print("good data added ok")
        p.add_datasource("mixed data", df_mixed)
        assert len(p.datasources[1]["columns"]) == 1
        print(
            "mixed data added ok, column metadata should not contain any NaN/Infinity etc:"
        )
        print(json.dumps(p.datasources[1]["columns"][0]))
        # providing 'columns' without a 'datatype' key will cause a KeyError, and the
        # column will be ignored (although the datasource will still think it has a
        # 'length' of 3 even though it has 0 columns)
        bad_columns = p.add_datasource(
            "bad column metadata", df_bad_text, columns=[{"name": "a", "type": "text"}]
        )
        assert len(bad_columns) == 1
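        # (inferred from the assert above, not documented here: add_datasource appears
        # to return the list of column specs it could not process, i.e. the one using
        # 'type' instead of 'datatype')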
        # this was coercing 'nan' to 0, which meant there was no difference between
        # 'nan' and 0, which is bad.
        p.add_datasource(
            "bad text data", df_bad_text, columns=[{"name": "a", "datatype": "text"}]
        )
        assert (
            len(p.datasources[3]["columns"][0]["values"]) == 3
        )  # expect ['0', '1', 'nan']
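        # note: the NaN survives as the literal category label 'nan' rather than being
        # coerced to 0, so missing values stay distinguishable in a text column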
        df_really_not_a_number = pd.DataFrame({"a": [1, 2, float("nan"), "ERROR"]})
        bad_columns = p.add_datasource(
            "really bad data",
            df_really_not_a_number,
            columns=[{"name": "a", "datatype": "double"}],
        )
        # expect `ValueError('zero-size array to reduction operation minimum which has no identity')`
        p.add_datasource("bad data", df_bad)
        assert (
            False
        )  # df_bad is sufficiently degenerate that we don't expect to get this far.
    except Exception as e:
        print(
            "There was an exception - was it expected? (yes) Does it help us understand how to fix the problem? What state is the project datasource metadata in after this?"
        )
        print(e)
        print(json.dumps(p.datasources, indent=2))
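    # uncomment to inspect the resulting project interactively in a browser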
    # p.serve(port=5055)