add some comments
This commit is contained in:
parent
4f444d4e9f
commit
2025ddcc7d
51
anova.py
51
anova.py
|
@ -3,18 +3,42 @@ import pandas as pd
|
|||
from scipy.stats import f
|
||||
|
||||
|
||||
"""
|
||||
This file needs to define the info in the *main* block, and then run the anova function.
|
||||
"""
|
||||
|
||||
|
||||
def do_print(lst_value, str_col):
    """
    Print one labelled, tab-separated row of rounded numbers.

    :param lst_value: numeric values to display on a single line
    :param str_col: row label; when it contains 'P value' the numbers are
        rounded to 4 decimals, otherwise to 3
    :return: None (the row is written to stdout)
    """
    # P-values need an extra digit of precision to stay readable near alpha.
    n_digits = 4 if 'P value' in str_col else 3
    pieces = []
    for value in lst_value:
        pieces.append(str(round(value, n_digits)))
    str_data = '\t'.join(pieces)
    print(f'{str_col}\t{str_data}')
|
||||
|
||||
|
||||
def anova(lst_col_seg, n_level, oa_file, result_file):
|
||||
def anova(lst_col_seg, n_level, oa_file, result_file, alpha=0.1):
|
||||
"""
|
||||
Give the files and info, compute the significance of each X for each Y
|
||||
|
||||
:param lst_col_seg: record the number of X, E, and Y.
|
||||
:param n_level:
|
||||
:param oa_file:
|
||||
:param result_file:
|
||||
:param alpha: significance level, usually 0.1, 0.05, 0.01
|
||||
:return:
|
||||
"""
|
||||
# read and check the files
|
||||
df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6)
|
||||
df_res = pd.read_excel(result_file, sheet_name=0, engine='openpyxl', index_col=0)
|
||||
assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong'
|
||||
assert df_oa.shape[1] == lst_col_seg[0] + lst_col_seg[1], 'the column number is wrong'
|
||||
lst_head = [f"{idx+1}_{ind_name}" for idx, ind_name in enumerate(df_res.columns)]
|
||||
|
||||
# The three lines below define some coefficients for further computation
|
||||
n_col_input = lst_col_seg[0] + lst_col_seg[1]
|
||||
n_exp_row = df_res.shape[0]
|
||||
n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0]
|
||||
|
@ -26,43 +50,40 @@ def anova(lst_col_seg, n_level, oa_file, result_file):
|
|||
|
||||
lst_report = []
|
||||
|
||||
# start to loop each Y
|
||||
for idx_col in range(lst_col_seg[2]):
|
||||
str_ind_name = lst_head[idx_col+n_col_input]
|
||||
|
||||
df_y_col = df_output.iloc[:, idx_col] # the y column
|
||||
# sum_y_2 = np.sum(np.power(df_y_col, 2)) # sum(y^2)
|
||||
df_y_col_repeated = np.tile(df_y_col, (n_col_input, 1)).T # repeat the y column
|
||||
big_t = df_y_col.sum() # the big T
|
||||
|
||||
# generate T1, ..., T(n_levels)
|
||||
lst_2d_big_t = []
|
||||
lst_2d_big_t = [] # Table 1, row 10, 11, 12
|
||||
for level in range(n_level):
|
||||
arr_big_t = np.sum(df_y_col_repeated * np.where(df_oa == level, 1, 0), axis=0)
|
||||
lst_2d_big_t.append(arr_big_t.tolist())
|
||||
arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2)
|
||||
arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row
|
||||
arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row # Table 1, last row
|
||||
assert arr_s.size == n_col_input, 'wrong arr_s size'
|
||||
# sum_s = np.sum(arr_s)
|
||||
|
||||
# so far, the first table is computed. Now, compute the second table
|
||||
df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)), columns=lst_head[:n_col_input])
|
||||
do_print(arr_s.tolist(), f'{str_ind_name}\tS')
|
||||
do_print(arr_s.tolist(), f'{str_ind_name}\tS') # Table 2, col 2
|
||||
|
||||
df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1)
|
||||
ms_of_error = df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error
|
||||
|
||||
do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS')
|
||||
do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS') # Table 2, col 4
|
||||
|
||||
arr_f = df_s_non_error / ms_of_error
|
||||
do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio')
|
||||
do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio') # Table 2, col 5
|
||||
|
||||
# alpha = 0.05
|
||||
alpha = 0.1
|
||||
arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error)
|
||||
do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value')
|
||||
arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error) # from scipy.stats import f
|
||||
do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value') # Table 2, col 6
|
||||
|
||||
lst_sig = [c for c, p in zip(lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha]
|
||||
# print(df_output.columns[idx_col], arr_p_value, lst_sig)
|
||||
|
||||
if len(lst_sig) > 0:
|
||||
lst_report.append(f"For indicator {str_ind_name}, the sig factors are {lst_sig}")
|
||||
|
||||
|
@ -72,11 +93,11 @@ def anova(lst_col_seg, n_level, oa_file, result_file):
|
|||
|
||||
if __name__ == '__main__':
    # first test: L25 orthogonal array (oa25.txt, 6 columns, 5 levels each)
    # NOTE(review): segmentation matches the asserts in anova() —
    # oa columns == lst_col_seg[0] + lst_col_seg[1] == 6.
    the_lst_col_seg = [5, 1, 6]  # 5 factors (X), 1 for error (E), and 6 indicators (Y)
    the_n_level = 5
    anova(the_lst_col_seg, the_n_level, "oa25.txt", "result/experiment_result_25.xlsx")

    # second test (kept for reference; enable by uncommenting)
    # the_lst_col_seg = [3, 1, 1]  # 3 factors, 1 for error, and 1 indicators
    # the_n_level = 3
    # anova(the_lst_col_seg, the_n_level, "oaL9-3-4.csv", "result.xlsx")
|
||||
|
|
|
@ -3,12 +3,23 @@ import pandas as pd
|
|||
from env import Env
|
||||
import datetime
|
||||
|
||||
"""
|
||||
The experiment.py is used to read the input value in the file xv.csv according to the Orthogonal Array (OA) table
|
||||
in the file oa25.txt.
|
||||
The file oa25 means that the number of total experiments is 25, as it has at most 6 inputs, each one has 5 levels.
|
||||
Therefore, oa25.txt has 6 columns, and the values are labelled as 0, 1, 2, 3 and 4.
|
||||
After reading these two files, the code below runs the model one-by-one (which is very ineffective), and
|
||||
uses lists to record the outputs, and save the results in the sub-folder result.
|
||||
"""
|
||||
|
||||
idx_start = 1 # first index is 1 !
|
||||
# This idx_start is used when the bulk running is unexpectedly stopped, and we need to rerun the model.
|
||||
idx_start = 1 # first index is 1, not 0 ! If we read oa25.txt, then the max value is 25
|
||||
|
||||
# number of runs for each experiment
|
||||
n_sample = 50
|
||||
|
||||
df_xv = pd.read_csv("xv.csv", header=None, index_col=None).transpose()
|
||||
# the names of four input indicators
|
||||
lst_xv_keys = [
|
||||
'alpha',
|
||||
'percent_search',
|
||||
|
@ -17,13 +28,16 @@ lst_xv_keys = [
|
|||
]
|
||||
df_xv.columns = lst_xv_keys
|
||||
|
||||
# read the OA table
|
||||
df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6)
|
||||
n_row, n_col = df_oa.shape
|
||||
# these para below keep unchanged
|
||||
model_para = {
|
||||
"n_worker": 1000,
|
||||
"n_firm": 100
|
||||
}
|
||||
|
||||
# defined six outputs
|
||||
lst_op_key = ['out_w_avg_salary', 'out_w_gini_salary', 'out_f_avg_profit', 'out_f_avg_yield',
|
||||
'out_f_gini_profit', 'out_w_percent_hired']
|
||||
lst_2d_op_avg = []
|
||||
|
@ -32,9 +46,11 @@ for idx_row in range(idx_start-1, n_row, 1):
|
|||
print(f"Running the {idx_row + 1}-th experiment at {datetime.datetime.now()}.")
|
||||
lst_value = []
|
||||
for idx_col, k in enumerate(lst_xv_keys):
|
||||
# read the corresponding value by mapping OA and xv.csv
|
||||
oa_idx_level = int(df_oa.iat[idx_row, idx_col])
|
||||
lst_value.append(df_xv.iloc[oa_idx_level][k])
|
||||
lst_2d_xv.append(lst_value)
|
||||
# merge the two dictionaries into one; send it to the AgentPy model
|
||||
dct_merge = {**model_para, **dict(zip(lst_xv_keys, lst_value))}
|
||||
# pprint.pprint(dct_merge)
|
||||
lst_2d_op = []
|
||||
|
@ -48,6 +64,7 @@ for idx_row in range(idx_start-1, n_row, 1):
|
|||
if not the_model.running:
|
||||
break
|
||||
|
||||
# record the six outputs
|
||||
lst_2d_op.append([the_model.out_w_avg_salary, the_model.out_w_gini_salary,
|
||||
the_model.out_f_avg_profit, the_model.out_f_avg_yield,
|
||||
the_model.out_f_gini_profit, the_model.out_w_percent_hired])
|
||||
|
@ -64,5 +81,6 @@ for idx_row in range(idx_start-1, n_row, 1):
|
|||
df_final = pd.concat([df_final.reset_index(drop=True),
|
||||
pd.DataFrame(lst_2d_op_avg, columns=lst_op_key).reset_index(drop=True)], axis=1)
|
||||
|
||||
# save to excel files
|
||||
df_final.to_excel(f'result/experiment_result_{idx_row + 1}.xlsx')
|
||||
df_final.to_csv(f'result/experiment_result_{idx_row + 1}.csv')
|
||||
|
|
Loading…
Reference in New Issue