tests/unit/test_csvcmp.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149

"""Tests for gn3.csvcmp"""
import pytest

from gn3.csvcmp import csv_diff
from gn3.csvcmp import extract_invalid_csv_headers
from gn3.csvcmp import fill_csv
from gn3.csvcmp import get_allowable_sampledata_headers
from gn3.csvcmp import remove_insignificant_edits
from gn3.csvcmp import extract_strain_name


@pytest.mark.unit_test
def test_fill_csv():
    """Test that filling a csv works properly"""
    test_input = """
Strain Name,Value,SE,Count,Sex
BXD1,18,x,0,
BXD12,16,x,x,
BXD14,15,x,x,
BXD15,14,x,x
"""
    expected_output = """Strain Name,Value,SE,Count,Sex
BXD1,18,x,0,x
BXD12,16,x,x,x
BXD14,15,x,x,x
BXD15,14,x,x,x"""
    assert fill_csv(test_input, width=5, value="x") == expected_output


@pytest.mark.unit_test
def test_remove_insignificant_data():
    """Test that values outside ε are removed/ ignored"""
    diff_data = {
        "Additions": [],
        "Deletions": [],
        "Modifications": [
            {"Current": "1.000001,3", "Original": "1,3"},
            {"Current": "1,3", "Original": "1.000001,3"},
            {"Current": "2.000001,3", "Original": "2,2"},
            {"Current": "1.01,3", "Original": "1,2"},
        ],
    }
    expected_json = {
        "Additions": [],
        "Deletions": [],
        "Modifications": [
            {"Current": "2,3", "Original": "2,2"},
            {"Current": "1.01,3", "Original": "1,2"},
        ],
    }
    assert remove_insignificant_edits(diff_data) == expected_json


@pytest.mark.unit_test
def test_csv_diff_same_columns():
    """Test csv diffing on data with the same number of columns"""
    assert csv_diff(base_csv="a,b\n1,2\n", delta_csv="a,b\n1,3") == {
        "Additions": [],
        "Deletions": [],
        "Columns": "",
        "Modifications": [{"Current": "1,3", "Original": "1,2"}],
    }


@pytest.mark.unit_test
def test_csv_diff_different_columns():
    """Test csv diffing on data with different columns"""
    base_csv = """
Strain Name,Value,SE,Count
BXD1,18,x,0
BXD12,16,x,x
BXD14,15,x,x
BXD15,14,x,x
"""
    delta_csv = """Strain Name,Value,SE,Count,Sex
BXD1,18,x,0
BXD12,16,x,x,1
BXD14,15,x,x
BXD15,14,x,x"""
    assert csv_diff(base_csv=base_csv, delta_csv=delta_csv) == {
        "Additions": [],
        "Columns": "Strain Name,Value,SE,Count,Sex",
        "Deletions": [],
        "Modifications": [{"Current": "BXD12,16,x,x,1", "Original": "BXD12,16,x,x,x"}],
    }


@pytest.mark.unit_test
def test_csv_diff_only_column_change():
    """Test csv diffing when only the column header change"""
    base_csv = """
Strain Name,Value,SE,Count
BXD1,18,x,0
BXD12,16,x,x
BXD14,15,x,x
BXD15,14,x,x
"""
    delta_csv = """Strain Name,Value,SE,Count,Sex
BXD1,18,x,0
BXD12,16,x,x
BXD14,15,x,x
BXD15,14,x,x
"""
    assert csv_diff(base_csv=base_csv, delta_csv=delta_csv) == {
        "Additions": [],
        "Deletions": [],
        "Modifications": [],
    }


@pytest.mark.unit_test
def test_extract_strain_name():
    """Test that the strain's name is extracted given a csv header"""
    assert (
        extract_strain_name(csv_header="Strain Name,Value,SE,Count", data="BXD1,18,x,0")
        == "BXD1"
    )


@pytest.mark.unit_test
def test_get_allowable_csv_headers(mocker):
    """Test that all the csv headers are fetched properly"""
    mock_conn = mocker.MagicMock()
    expected_values = [
        "Strain Name", "Value", "SE", "Count",
        "Condition", "Tissue", "Sex", "Age",
        "Ethn.", "PMI (hrs)", "pH", "Color",
    ]
    with mock_conn.cursor() as cursor:
        cursor.fetchall.return_value = (
            ('Condition',), ('Tissue',), ('Sex',),
            ('Age',), ('Ethn.',), ('PMI (hrs)',), ('pH',), ('Color',))
        assert get_allowable_sampledata_headers(mock_conn) == expected_values
        cursor.execute.assert_called_once_with(
            "SELECT Name from CaseAttribute")


@pytest.mark.unit_test
def test_extract_invalid_csv_headers_with_some_wrong_headers():
    """Test that invalid column headers are extracted correctly from a csv
string"""
    allowed_headers = [
        "Strain Name", "Value", "SE", "Count",
        "Condition", "Tissue", "Sex", "Age",
        "Ethn.", "PMI (hrs)", "pH", "Color",
    ]

    csv_text = "Strain Name, Value, SE, Colour"
    assert extract_invalid_csv_headers(allowed_headers, csv_text) == ["Colour"]