IIabm/anova.py

import numpy as np
import pandas as pd
from orm import engine
from scipy.stats import f


"""
    This file needs to define the info in the *main* block,
    and then run the anova function.
"""


def do_print(lst_value, str_col):
    """
    Just for friendly-looking printing

    :param lst_value:
    :param str_col:
    :return:
    """
    str_data = '\t'.join(
        [str(round(e, 4 if 'P value' in str_col else 3)) for e in lst_value])
    print(f'{str_col}\t{str_data}')


def anova(lst_col_seg, n_level, oa_file, result_file, alpha=0.1):
    """
    Give the files and info, compute the significance of each X for each Y

    :param lst_col_seg: record the number of X, E, and Y.
    :param n_level:
    :param oa_file:
    :param result_file:
    :param alpha: significance level, usually 0.1, 0.05, 0.01
    :return:
    """
    # read and check the files
    df_oa = pd.read_csv("oa_with_exp.csv", index_col=None)
    df_res = result_file
    assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong'
    assert df_oa.shape[1] == lst_col_seg[0] + \
        lst_col_seg[1], 'the column number is wrong'
    lst_head = [f"{idx+1}_{ind_name}" for idx,
                ind_name in enumerate(df_res.columns)]

    # The three lines below define some coefficients for further computation
    n_col_input = lst_col_seg[0] + lst_col_seg[1]
    n_exp_row = df_res.shape[0]
    n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0]

    df_output = df_res.iloc[:, n_col_input:]

    print("Source\tSource\t" + '\t'.join(lst_head[:lst_col_seg[0]]) + "\te")
    print("DOF\tDOF\t" + '\t'.join([str(n_level-1)]
          * lst_col_seg[0]) + f"\t{n_degree_error}")

    lst_report = []

    # start to loop each Y
    for idx_col in range(lst_col_seg[2]):
        str_ind_name = lst_head[idx_col+n_col_input]

        df_y_col = df_output.iloc[:, idx_col]  # the y column
        df_y_col_repeated = np.tile(
            df_y_col, (n_col_input, 1)).T  # repeat the y column
        big_t = df_y_col.sum()  # the big T

        # generate T1, ..., T(n_levels)
        lst_2d_big_t = []  # Table 1, row 10, 11, 12
        for level in range(n_level):
            arr_big_t = np.sum(df_y_col_repeated *
                               np.where(df_oa == level, 1, 0), axis=0)
            lst_2d_big_t.append(arr_big_t.tolist())
        arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2)
        arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - \
            big_t * big_t / n_exp_row  # Table 1, last row
        assert arr_s.size == n_col_input, 'wrong arr_s size'

        # so far, the first table is computed. Now, compute the second table
        df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)),
                            columns=lst_head[:n_col_input])
        do_print(arr_s.tolist(), f'{str_ind_name}\tS')  # Table 2, col 2

        df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1)
        ms_of_error = \
            df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error

        do_print(df_s_non_error.values.tolist()[
                 0] + [ms_of_error], f'{str_ind_name}\tMS')  # Table 2, col 4

        arr_f = df_s_non_error / ms_of_error
        # Table 2, col 5
        do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio')

        # from scipy.stats import f
        arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error)
        # Table 2, col 6
        do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value')

        lst_sig = [c for c, p in zip(
            lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha]

        if len(lst_sig) > 0:
            lst_report.append(
                f"For indicator {str_ind_name}, the sig factors are {lst_sig}")

    for s in lst_report:
        print(s)


if __name__ == '__main__':
    # prep data
    str_sql = """
    select * from
    (select distinct idx_scenario, n_max_trial, crit_supplier,
    firm_pref_request, firm_pref_accept, netw_pref_cust_n,
    netw_pref_cust_size, cap_limit, diff_new_conn, diff_remove
    from iiabmdb.with_exp_experiment) as a
    inner join
    (
    select idx_scenario,
    sum(n_disrupt_s) as n_disrupt_s, sum(n_disrupt_t) as n_disrupt_t from
    iiabmdb.with_exp_experiment as a
    inner join
    (
    select e_id, count(n_s_disrupt_t) as n_disrupt_s,
    sum(n_s_disrupt_t) as n_disrupt_t from
    iiabmdb.with_exp_sample as a
    inner join
    (select a.s_id as s_id, count(id) as n_s_disrupt_t from
    iiabmdb.with_exp_result as a
    inner join
    (select distinct s_id from iiabmdb.with_exp_result where ts > 0) as b
    on a.s_id = b.s_id
    group by s_id
    ) as b
    on a.id = b.s_id
    group by e_id
    ) as b
    on a.id = b.e_id
    group by idx_scenario) as b
    on a.idx_scenario = b.idx_scenario;

    """
    result = pd.read_sql(sql=str_sql,
                         con=engine)
    result.drop('idx_scenario', 1, inplace=True)
    df_oa = pd.read_csv("oa_with_exp.csv", index_col=None)
    result = pd.concat(
        [result.iloc[:, 0:9], df_oa.iloc[:, -4:], result.iloc[:, -2:]], axis=1)
    result.to_csv('analysis\\experiment_result.csv')

    # 9 factors (X), 4 for error (E), and 2 indicators (Y)
    the_lst_col_seg = [9, 4, 2]
    the_n_level = 3
    anova(the_lst_col_seg, the_n_level, "oa25.txt", result, 0.1)
anova 2023-05-22 17:25:10 +08:00			`import numpy as np`
			`import pandas as pd`
			`from orm import engine`
			`from scipy.stats import f`


			`"""`
			`This file needs to define the info in the main block,`
			`and then run the anova function.`
			`"""`


			`def do_print(lst_value, str_col):`
			`"""`
			`Just for friendly-looking printing`

			`:param lst_value:`
			`:param str_col:`
			`:return:`
			`"""`
			`str_data = '\t'.join(`
			`[str(round(e, 4 if 'P value' in str_col else 3)) for e in lst_value])`
			`print(f'{str_col}\t{str_data}')`


			`def anova(lst_col_seg, n_level, oa_file, result_file, alpha=0.1):`
			`"""`
			`Give the files and info, compute the significance of each X for each Y`

			`:param lst_col_seg: record the number of X, E, and Y.`
			`:param n_level:`
			`:param oa_file:`
			`:param result_file:`
			`:param alpha: significance level, usually 0.1, 0.05, 0.01`
			`:return:`
			`"""`
			`# read and check the files`
			`df_oa = pd.read_csv("oa_with_exp.csv", index_col=None)`
			`df_res = result_file`
			`assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong'`
			`assert df_oa.shape[1] == lst_col_seg[0] + \`
			`lst_col_seg[1], 'the column number is wrong'`
			`lst_head = [f"{idx+1}_{ind_name}" for idx,`
			`ind_name in enumerate(df_res.columns)]`

			`# The three lines below define some coefficients for further computation`
			`n_col_input = lst_col_seg[0] + lst_col_seg[1]`
			`n_exp_row = df_res.shape[0]`
			`n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0]`

			`df_output = df_res.iloc[:, n_col_input:]`

			`print("Source\tSource\t" + '\t'.join(lst_head[:lst_col_seg[0]]) + "\te")`
			`print("DOF\tDOF\t" + '\t'.join([str(n_level-1)]`
			`* lst_col_seg[0]) + f"\t{n_degree_error}")`

			`lst_report = []`

			`# start to loop each Y`
			`for idx_col in range(lst_col_seg[2]):`
			`str_ind_name = lst_head[idx_col+n_col_input]`

			`df_y_col = df_output.iloc[:, idx_col] # the y column`
			`df_y_col_repeated = np.tile(`
			`df_y_col, (n_col_input, 1)).T # repeat the y column`
			`big_t = df_y_col.sum() # the big T`

			`# generate T1, ..., T(n_levels)`
			`lst_2d_big_t = [] # Table 1, row 10, 11, 12`
			`for level in range(n_level):`
			`arr_big_t = np.sum(df_y_col_repeated *`
			`np.where(df_oa == level, 1, 0), axis=0)`
			`lst_2d_big_t.append(arr_big_t.tolist())`
			`arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2)`
			`arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - \`
			`big_t * big_t / n_exp_row # Table 1, last row`
			`assert arr_s.size == n_col_input, 'wrong arr_s size'`

			`# so far, the first table is computed. Now, compute the second table`
			`df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)),`
			`columns=lst_head[:n_col_input])`
			`do_print(arr_s.tolist(), f'{str_ind_name}\tS') # Table 2, col 2`

			`df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1)`
			`ms_of_error = \`
			`df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error`

			`do_print(df_s_non_error.values.tolist()[`
			`0] + [ms_of_error], f'{str_ind_name}\tMS') # Table 2, col 4`

			`arr_f = df_s_non_error / ms_of_error`
			`# Table 2, col 5`
			`do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio')`

			`# from scipy.stats import f`
			`arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error)`
			`# Table 2, col 6`
			`do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value')`

			`lst_sig = [c for c, p in zip(`
			`lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha]`

			`if len(lst_sig) > 0:`
			`lst_report.append(`
			`f"For indicator {str_ind_name}, the sig factors are {lst_sig}")`

			`for s in lst_report:`
			`print(s)`


			`if __name__ == '__main__':`
			`# prep data`
			`str_sql = """`
			`select * from`
			`(select distinct idx_scenario, n_max_trial, crit_supplier,`
			`firm_pref_request, firm_pref_accept, netw_pref_cust_n,`
			`netw_pref_cust_size, cap_limit, diff_new_conn, diff_remove`
			`from iiabmdb.with_exp_experiment) as a`
			`inner join`
			`(`
			`select idx_scenario,`
			`sum(n_disrupt_s) as n_disrupt_s, sum(n_disrupt_t) as n_disrupt_t from`
			`iiabmdb.with_exp_experiment as a`
			`inner join`
			`(`
			`select e_id, count(n_s_disrupt_t) as n_disrupt_s,`
			`sum(n_s_disrupt_t) as n_disrupt_t from`
			`iiabmdb.with_exp_sample as a`
			`inner join`
			`(select a.s_id as s_id, count(id) as n_s_disrupt_t from`
			`iiabmdb.with_exp_result as a`
			`inner join`
			`(select distinct s_id from iiabmdb.with_exp_result where ts > 0) as b`
			`on a.s_id = b.s_id`
			`group by s_id`
			`) as b`
			`on a.id = b.s_id`
			`group by e_id`
			`) as b`
			`on a.id = b.e_id`
			`group by idx_scenario) as b`
			`on a.idx_scenario = b.idx_scenario;`

			`"""`
			`result = pd.read_sql(sql=str_sql,`
			`con=engine)`
			`result.drop('idx_scenario', 1, inplace=True)`
			`df_oa = pd.read_csv("oa_with_exp.csv", index_col=None)`
			`result = pd.concat(`
			`[result.iloc[:, 0:9], df_oa.iloc[:, -4:], result.iloc[:, -2:]], axis=1)`
			`result.to_csv('analysis\\experiment_result.csv')`

			`# 9 factors (X), 4 for error (E), and 2 indicators (Y)`
			`the_lst_col_seg = [9, 4, 2]`
			`the_n_level = 3`
			`anova(the_lst_col_seg, the_n_level, "oa25.txt", result, 0.1)`