# tests/unit/test_csvcmp.py

"""Tests for gn3.csvcmp"""
import pytest

from gn3.csvcmp import clean_csv_text
from gn3.csvcmp import csv_diff
from gn3.csvcmp import extract_invalid_csv_headers
from gn3.csvcmp import extract_strain_name
from gn3.csvcmp import fill_csv
from gn3.csvcmp import parse_csv_column
from gn3.csvcmp import remove_insignificant_edits


@pytest.mark.unit_test
def test_fill_csv():
    """Check that fill_csv pads every data row to the requested width.

    The input is wrapped in blank lines, has rows whose last field is
    empty and one row that is a column short.  With ``width=5`` and
    ``value="x"`` each data row is expected back with five fields, the
    gaps holding ``"x"``, and without the surrounding blank lines.
    """
    ragged_csv = (
        "\n"
        "Strain Name,Value,SE,Count,Sex\n"
        "BXD1,18,x,0,\n"
        "BXD12,16,x,x,\n"
        "BXD14,15,x,x,\n"
        "BXD15,14,x,x\n"
    )
    filled_csv = (
        "Strain Name,Value,SE,Count,Sex\n"
        "BXD1,18,x,0,x\n"
        "BXD12,16,x,x,x\n"
        "BXD14,15,x,x,x\n"
        "BXD15,14,x,x,x"
    )
    result = fill_csv(ragged_csv, width=5, value="x")
    assert result == filled_csv


@pytest.mark.unit_test
def test_remove_insignificant_data():
    """Check that negligible numeric edits are filtered out of a diff.

    Modifications whose only change is a tiny numeric difference
    (``1`` vs ``1.000001``) are expected to disappear.  When a row also
    carries a real change, the row is kept but the negligible field is
    expected to be reported with its original value (``2.000001`` is
    reported as ``2``).
    """
    modifications = [
        # Negligible in either direction: expected to be dropped.
        {"Current": "1.000001,3", "Original": "1,3"},
        {"Current": "1,3", "Original": "1.000001,3"},
        # Second field really changed: expected to be kept.
        {"Current": "2.000001,3", "Original": "2,2"},
        {"Current": "1.01,3", "Original": "1,2"},
    ]
    diff_data = {
        "Additions": [],
        "Deletions": [],
        "Modifications": modifications,
    }
    significant_only = {
        "Additions": [],
        "Deletions": [],
        "Modifications": [
            {"Current": "2,3", "Original": "2,2"},
            {"Current": "1.01,3", "Original": "1,2"},
        ],
    }
    result = remove_insignificant_edits(diff_data)
    assert result == significant_only


@pytest.mark.unit_test
def test_csv_diff_same_columns():
    """Check csv_diff on two inputs that share the same two columns.

    The base header carries a trailing space and the base text a trailing
    newline; the only difference expected in the report is the changed
    data row.
    """
    expected_diff = {
        "Additions": [],
        "Deletions": [],
        "Columns": "",
        "Modifications": [{"Current": "1,3", "Original": "1,2"}],
    }
    actual_diff = csv_diff(base_csv="a,b \n1,2\n", delta_csv="a,b\n1,3")
    assert actual_diff == expected_diff


@pytest.mark.unit_test
def test_csv_diff_different_columns():
    """Check csv_diff when the delta adds a column and fills one cell.

    The delta header gains a ``Sex`` column but only the ``BXD12`` row
    supplies a value for it, so that row is the single modification
    expected, with ``x`` shown as the original value of the new field.
    """
    original_csv = (
        "\n"
        "Strain Name,Value,SE,Count\n"
        "BXD1,18,x,0\n"
        "BXD12,16,x,x\n"
        "BXD14,15,x,x\n"
        "BXD15,14,x,x\n"
    )
    edited_csv = (
        "Strain Name,Value,SE,Count,Sex\n"
        "BXD1,18,x,0\n"
        "BXD12,16,x,x,1\n"
        "BXD14,15,x,x\n"
        "BXD15,14,x,x"
    )
    expected_diff = {
        "Additions": [],
        "Columns": "Strain Name,Value,SE,Count,Sex",
        "Deletions": [],
        "Modifications": [
            {"Current": "BXD12,16,x,x,1", "Original": "BXD12,16,x,x,x"}
        ],
    }
    actual_diff = csv_diff(base_csv=original_csv, delta_csv=edited_csv)
    assert actual_diff == expected_diff


@pytest.mark.unit_test
def test_csv_diff_only_column_change():
    """Check csv_diff when only the column header changes.

    The delta adds a ``Sex`` column to the header but no row supplies a
    value for it, so no additions, deletions or modifications are
    expected.
    """
    original_csv = (
        "\n"
        "Strain Name,Value,SE,Count\n"
        "BXD1,18,x,0\n"
        "BXD12,16,x,x\n"
        "BXD14,15,x,x\n"
        "BXD15,14,x,x\n"
    )
    edited_csv = (
        "Strain Name,Value,SE,Count,Sex\n"
        "BXD1,18,x,0\n"
        "BXD12,16,x,x\n"
        "BXD14,15,x,x\n"
        "BXD15,14,x,x\n"
    )
    # NOTE(review): unlike the other csv_diff tests in this module, no
    # "Columns" key is expected here -- confirm that this is intentional.
    empty_diff = {
        "Additions": [],
        "Deletions": [],
        "Modifications": [],
    }
    actual_diff = csv_diff(base_csv=original_csv, delta_csv=edited_csv)
    assert actual_diff == empty_diff


@pytest.mark.unit_test
def test_extract_strain_name():
    """Check that the strain name is pulled out of a data row.

    Given the standard header and a matching row, the value expected
    back is the row's first field, ``"BXD1"``.
    """
    header = "Strain Name,Value,SE,Count"
    row = "BXD1,18,x,0"
    strain = extract_strain_name(csv_header=header, data=row)
    assert strain == "BXD1"


@pytest.mark.unit_test
def test_extract_invalid_csv_headers_with_some_wrong_headers():
    """Check that only the disallowed column headers are reported.

    The header line mixes three allowed names with ``Colour``, which is
    not in the allowed list (only ``Color`` is); the names are separated
    by a comma followed by a space.
    """
    permitted = [
        "Strain Name", "Value", "SE", "Count",
        "Condition", "Tissue", "Sex", "Age",
        "Ethn.", "PMI (hrs)", "pH", "Color",
    ]
    header_line = "Strain Name, Value, SE, Colour"

    rejected = extract_invalid_csv_headers(permitted, header_line)
    assert rejected == ["Colour"]


@pytest.mark.unit_test
def test_clean_csv():
    """Check that clean_csv_text strips stray whitespace from csv text.

    The messy input has blank lines around it, a trailing space on the
    header and spaces on either side of some fields; the cleaned text is
    expected to have none of these while keeping the empty last field of
    the final row.
    """
    # Written as explicit literals so the trailing space on the header
    # line is visible and cannot be stripped by an editor.
    messy_csv = (
        "\n"
        "Strain Name,Value,SE,Count \n"
        "BXD1,18,x ,0\n"
        "BXD12, 16,x,x\n"
        "BXD14,15 ,x,x\n"
        "BXD15,14,x,\n"
    )
    tidy_csv = (
        "Strain Name,Value,SE,Count\n"
        "BXD1,18,x,0\n"
        "BXD12,16,x,x\n"
        "BXD14,15,x,x\n"
        "BXD15,14,x,"
    )

    assert clean_csv_text(messy_csv) == tidy_csv
    assert clean_csv_text("a,b \n1,2\n") == "a,b\n1,2"


@pytest.mark.unit_test
def test_parse_column_string():
    """Check that a column header is split into its two parts.

    The expected result is a pair: the text found in a trailing
    parenthesis (or ``None`` when there is none) followed by the header
    name without that suffix or the whitespace before it.
    """
    cases = [
        ("Header", (None, "Header")),
        ("Header (1)", ("1", "Header")),
        ("Some Other Header   (1)", ("1", "Some Other Header")),
    ]
    for raw_column, parsed in cases:
        assert parse_csv_column(raw_column) == parsed