add some comments

2023-03-23 23:05:51 +08:00
parent 4f444d4e9f
commit 2025ddcc7d
2 changed files with 55 additions and 16 deletions
--- a/anova.py
+++ b/anova.py
@@ -3,18 +3,42 @@ import pandas as pd
 from scipy.stats import f
 """
    This file needs to define the info in the *main* block, and then run the anova function.
 """
 def do_print(lst_value, str_col):
    """
    Just for friendly-looking printing
    :param lst_value:
    :param str_col:
    :return:
    """
    str_data = '\t'.join([str(round(e, 4 if 'P value' in str_col else 3)) for e in lst_value])
    print(f'{str_col}\t{str_data}')
-def anova(lst_col_seg, n_level, oa_file, result_file):
+def anova(lst_col_seg, n_level, oa_file, result_file, alpha=0.1):
    """
    Give the files and info, compute the significance of each X for each Y
    :param lst_col_seg: record the number of X, E, and Y.
    :param n_level:
    :param oa_file:
    :param result_file:
    :param alpha: significance level, usually 0.1, 0.05, 0.01
    :return:
    """
    # read and check the files
    df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6)
    df_res = pd.read_excel(result_file, sheet_name=0, engine='openpyxl', index_col=0)
    assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong'
    assert df_oa.shape[1] == lst_col_seg[0] + lst_col_seg[1], 'the column number is wrong'
    lst_head = [f"{idx+1}_{ind_name}" for idx, ind_name in enumerate(df_res.columns)]
    # The three lines below define some coefficients for further computation
    n_col_input = lst_col_seg[0] + lst_col_seg[1]
    n_exp_row = df_res.shape[0]
    n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0]
@@ -26,43 +50,40 @@ def anova(lst_col_seg, n_level, oa_file, result_file):
    lst_report = []
    # start to loop each Y
    for idx_col in range(lst_col_seg[2]):
        str_ind_name = lst_head[idx_col+n_col_input]
        df_y_col = df_output.iloc[:, idx_col]  # the y column
        # sum_y_2 = np.sum(np.power(df_y_col, 2))  # sum(y^2)
        df_y_col_repeated = np.tile(df_y_col, (n_col_input, 1)).T  # repeat the y column
        big_t = df_y_col.sum()  # the big T
        # generate T1, ..., T(n_levels)
-        lst_2d_big_t = []
+        lst_2d_big_t = []  # Table 1, row 10, 11, 12
        for level in range(n_level):
            arr_big_t = np.sum(df_y_col_repeated * np.where(df_oa == level, 1, 0), axis=0)
            lst_2d_big_t.append(arr_big_t.tolist())
        arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2)
-        arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row
+        arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row  # Table 1, last row
        assert arr_s.size == n_col_input, 'wrong arr_s size'
        # sum_s = np.sum(arr_s)
        # so far, the first table is computed. Now, compute the second table
        df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)), columns=lst_head[:n_col_input])
-        do_print(arr_s.tolist(), f'{str_ind_name}\tS')
+        do_print(arr_s.tolist(), f'{str_ind_name}\tS')  # Table 2, col 2
        df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1)
        ms_of_error = df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error
-        do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS')
+        do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS')  # Table 2, col 4
        arr_f = df_s_non_error / ms_of_error
-        do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio')
+        do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio')  # Table 2, col 5
-        # alpha = 0.05
+        arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error)  # from scipy.stats import f
-        alpha = 0.1
+        do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value')  # Table 2, col 6
        arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error)
        do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value')
        lst_sig = [c for c, p in zip(lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha]
-        # print(df_output.columns[idx_col], arr_p_value, lst_sig)
+
        if len(lst_sig) > 0:
            lst_report.append(f"For indicator {str_ind_name}, the sig factors are {lst_sig}")
@@ -72,11 +93,11 @@ def anova(lst_col_seg, n_level, oa_file, result_file):
 if __name__ == '__main__':
    # first test
-    the_lst_col_seg = [5, 1, 6]  # 11 factors, 2 for error, and 6 indicators
+    the_lst_col_seg = [5, 1, 6]  # 11 factors (X), 2 for error (E), and 6 indicators (Y)
    the_n_level = 5
    anova(the_lst_col_seg, the_n_level, "oa25.txt", "result/experiment_result_25.xlsx")
    # second test
-    # the_lst_col_seg = [3, 1, 1]  # 11 factors, 2 for error, and 6 indicators
+    # the_lst_col_seg = [3, 1, 1]  # 3 factors, 1 for error, and 1 indicators
    # the_n_level = 3
    # anova(the_lst_col_seg, the_n_level, "oaL9-3-4.csv", "result.xlsx")
--- a/experiment.py
+++ b/experiment.py
@@ -3,12 +3,23 @@ import pandas as pd
 from env import Env
 import datetime
 """
    The experiment.py is used to read the input value in the file xv.csv according to the Orthogonal Array (OA) table 
    in the file oa25.txt.
    The file oa25 means that the number of total experiments is 25, as it has at most 6 inputs, each one has 5 levels.
    Therefore, oa25.txt has 6 columns, and the values are labelled as 0, 1, 2, 3 and 4.
    After reading these two files, the code below runs the model one-by-one (which is very ineffective), and
     uses lists to record the outputs, and save the results in the sub-folder result. 
 """
-idx_start = 1  # first index is 1 !
+# This idx_start is used when the bulk running is unexpectedly stopped, and we need to rerun the model.
 idx_start = 1  # first index is 1, not 0 ! If we read oa25.txt, then the max value is 25
 # number of runs for each experiment
 n_sample = 50
 df_xv = pd.read_csv("xv.csv", header=None, index_col=None).transpose()
 # the names of four input indicators
 lst_xv_keys = [
    'alpha',
    'percent_search',
@@ -17,13 +28,16 @@ lst_xv_keys = [
 ]
 df_xv.columns = lst_xv_keys
 # read the OA table
 df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6)
 n_row, n_col = df_oa.shape
 # these para below keep unchanged
 model_para = {
    "n_worker": 1000,
    "n_firm": 100
 }
 # defined six outputs
 lst_op_key = ['out_w_avg_salary', 'out_w_gini_salary', 'out_f_avg_profit', 'out_f_avg_yield',
              'out_f_gini_profit', 'out_w_percent_hired']
 lst_2d_op_avg = []
@@ -32,9 +46,11 @@ for idx_row in range(idx_start-1, n_row, 1):
    print(f"Running the {idx_row + 1}-th experiment at {datetime.datetime.now()}.")
    lst_value = []
    for idx_col, k in enumerate(lst_xv_keys):
        # read the corresponding value by mapping OA and xv.csv
        oa_idx_level = int(df_oa.iat[idx_row, idx_col])
        lst_value.append(df_xv.iloc[oa_idx_level][k])
    lst_2d_xv.append(lst_value)
    # merge the two dictionaries into one; send it to the AgentPy model
    dct_merge = {**model_para, **dict(zip(lst_xv_keys, lst_value))}
    # pprint.pprint(dct_merge)
    lst_2d_op = []
@@ -48,6 +64,7 @@ for idx_row in range(idx_start-1, n_row, 1):
            if not the_model.running:
                break
        # record the six outputs
        lst_2d_op.append([the_model.out_w_avg_salary, the_model.out_w_gini_salary,
                          the_model.out_f_avg_profit, the_model.out_f_avg_yield,
                          the_model.out_f_gini_profit, the_model.out_w_percent_hired])
@@ -64,5 +81,6 @@ for idx_row in range(idx_start-1, n_row, 1):
    df_final = pd.concat([df_final.reset_index(drop=True),
                          pd.DataFrame(lst_2d_op_avg, columns=lst_op_key).reset_index(drop=True)], axis=1)
    # save to excel files
    df_final.to_excel(f'result/experiment_result_{idx_row + 1}.xlsx')
    df_final.to_csv(f'result/experiment_result_{idx_row + 1}.csv')