diff --git a/anova.py b/anova.py index c86cf6d..f23b1b0 100644 --- a/anova.py +++ b/anova.py @@ -3,18 +3,42 @@ import pandas as pd from scipy.stats import f +""" + This file needs to define the info in the *main* block, and then run the anova function. +""" + + def do_print(lst_value, str_col): + """ + Just for friendly-looking printing + + :param lst_value: + :param str_col: + :return: + """ str_data = '\t'.join([str(round(e, 4 if 'P value' in str_col else 3)) for e in lst_value]) print(f'{str_col}\t{str_data}') -def anova(lst_col_seg, n_level, oa_file, result_file): +def anova(lst_col_seg, n_level, oa_file, result_file, alpha=0.1): + """ + Give the files and info, compute the significance of each X for each Y + + :param lst_col_seg: record the number of X, E, and Y. + :param n_level: + :param oa_file: + :param result_file: + :param alpha: significance level, usually 0.1, 0.05, 0.01 + :return: + """ + # read and check the files df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6) df_res = pd.read_excel(result_file, sheet_name=0, engine='openpyxl', index_col=0) assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong' assert df_oa.shape[1] == lst_col_seg[0] + lst_col_seg[1], 'the column number is wrong' lst_head = [f"{idx+1}_{ind_name}" for idx, ind_name in enumerate(df_res.columns)] + # The three lines below define some coefficients for further computation n_col_input = lst_col_seg[0] + lst_col_seg[1] n_exp_row = df_res.shape[0] n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0] @@ -26,43 +50,40 @@ def anova(lst_col_seg, n_level, oa_file, result_file): lst_report = [] + # start to loop each Y for idx_col in range(lst_col_seg[2]): str_ind_name = lst_head[idx_col+n_col_input] df_y_col = df_output.iloc[:, idx_col] # the y column - # sum_y_2 = np.sum(np.power(df_y_col, 2)) # sum(y^2) df_y_col_repeated = np.tile(df_y_col, (n_col_input, 1)).T # repeat the y column big_t = df_y_col.sum() # the big T # generate T1, ..., T(n_levels) - 
lst_2d_big_t = [] + lst_2d_big_t = [] # Table 1, row 10, 11, 12 for level in range(n_level): arr_big_t = np.sum(df_y_col_repeated * np.where(df_oa == level, 1, 0), axis=0) lst_2d_big_t.append(arr_big_t.tolist()) arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2) - arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row + arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row # Table 1, last row assert arr_s.size == n_col_input, 'wrong arr_s size' - # sum_s = np.sum(arr_s) # so far, the first table is computed. Now, compute the second table df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)), columns=lst_head[:n_col_input]) - do_print(arr_s.tolist(), f'{str_ind_name}\tS') + do_print(arr_s.tolist(), f'{str_ind_name}\tS') # Table 2, col 2 df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1) ms_of_error = df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error - do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS') + do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS') # Table 2, col 4 arr_f = df_s_non_error / ms_of_error - do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio') + do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio') # Table 2, col 5 - # alpha = 0.05 - alpha = 0.1 - arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error) - do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value') + arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error) # from scipy.stats import f + do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value') # Table 2, col 6 lst_sig = [c for c, p in zip(lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha] - # print(df_output.columns[idx_col], arr_p_value, lst_sig) + if len(lst_sig) > 0: lst_report.append(f"For indicator {str_ind_name}, the sig factors are {lst_sig}") @@ -72,11 +93,11 @@ def anova(lst_col_seg, n_level, oa_file, result_file): if __name__ == '__main__': # 
first test - the_lst_col_seg = [5, 1, 6] # 11 factors, 2 for error, and 6 indicators + the_lst_col_seg = [5, 1, 6] # 5 factors (X), 1 for error (E), and 6 indicators (Y) the_n_level = 5 anova(the_lst_col_seg, the_n_level, "oa25.txt", "result/experiment_result_25.xlsx") # second test - # the_lst_col_seg = [3, 1, 1] # 11 factors, 2 for error, and 6 indicators + # the_lst_col_seg = [3, 1, 1] # 3 factors, 1 for error, and 1 indicator # the_n_level = 3 # anova(the_lst_col_seg, the_n_level, "oaL9-3-4.csv", "result.xlsx") diff --git a/experiment.py b/experiment.py index d4f8f75..1d261d5 100644 --- a/experiment.py +++ b/experiment.py @@ -3,12 +3,23 @@ import pandas as pd from env import Env import datetime +""" + The experiment.py is used to read the input value in the file xv.csv according to the Orthogonal Array (OA) table + in the file oa25.txt. + The file oa25 means that the number of total experiments is 25, as it has at most 6 inputs, each one has 5 levels. + Therefore, oa25.txt has 6 columns, and the values are labelled as 0, 1, 2, 3 and 4. + After reading these two files, the code below runs the model one-by-one (which is very inefficient), and + uses lists to record the outputs, and saves the results in the sub-folder result. +""" -idx_start = 1 # first index is 1 ! +# This idx_start is used when the bulk running is unexpectedly stopped, and we need to rerun the model. +idx_start = 1 # first index is 1, not 0 ! 
If we read oa25.txt, then the max value is 25 +# number of runs for each experiment n_sample = 50 df_xv = pd.read_csv("xv.csv", header=None, index_col=None).transpose() +# the names of the four input parameters lst_xv_keys = [ 'alpha', 'percent_search', @@ -17,13 +28,16 @@ lst_xv_keys = [ ] df_xv.columns = lst_xv_keys +# read the OA table df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6) n_row, n_col = df_oa.shape +# these parameters below stay unchanged model_para = { "n_worker": 1000, "n_firm": 100 } +# define the six outputs lst_op_key = ['out_w_avg_salary', 'out_w_gini_salary', 'out_f_avg_profit', 'out_f_avg_yield', 'out_f_gini_profit', 'out_w_percent_hired'] lst_2d_op_avg = [] @@ -32,9 +46,11 @@ for idx_row in range(idx_start-1, n_row, 1): print(f"Running the {idx_row + 1}-th experiment at {datetime.datetime.now()}.") lst_value = [] for idx_col, k in enumerate(lst_xv_keys): + # read the corresponding value by mapping OA and xv.csv oa_idx_level = int(df_oa.iat[idx_row, idx_col]) lst_value.append(df_xv.iloc[oa_idx_level][k]) lst_2d_xv.append(lst_value) + # merge the two dictionaries into one; send it to the AgentPy model dct_merge = {**model_para, **dict(zip(lst_xv_keys, lst_value))} # pprint.pprint(dct_merge) lst_2d_op = [] @@ -48,6 +64,7 @@ for idx_row in range(idx_start-1, n_row, 1): if not the_model.running: break + # record the six outputs lst_2d_op.append([the_model.out_w_avg_salary, the_model.out_w_gini_salary, the_model.out_f_avg_profit, the_model.out_f_avg_yield, the_model.out_f_gini_profit, the_model.out_w_percent_hired]) @@ -64,5 +81,6 @@ for idx_row in range(idx_start-1, n_row, 1): df_final = pd.concat([df_final.reset_index(drop=True), pd.DataFrame(lst_2d_op_avg, columns=lst_op_key).reset_index(drop=True)], axis=1) + # save to Excel and CSV files df_final.to_excel(f'result/experiment_result_{idx_row + 1}.xlsx') df_final.to_csv(f'result/experiment_result_{idx_row + 1}.csv')