add some comments

This commit is contained in:
He Zhou 2023-03-23 23:05:51 +08:00
parent 4f444d4e9f
commit 2025ddcc7d
2 changed files with 55 additions and 16 deletions

View File

@ -3,18 +3,42 @@ import pandas as pd
from scipy.stats import f from scipy.stats import f
"""
This file needs to define the info in the *main* block, and then run the anova function.
"""
def do_print(lst_value, str_col): def do_print(lst_value, str_col):
"""
Just for friendly-looking printing
:param lst_value:
:param str_col:
:return:
"""
str_data = '\t'.join([str(round(e, 4 if 'P value' in str_col else 3)) for e in lst_value]) str_data = '\t'.join([str(round(e, 4 if 'P value' in str_col else 3)) for e in lst_value])
print(f'{str_col}\t{str_data}') print(f'{str_col}\t{str_data}')
def anova(lst_col_seg, n_level, oa_file, result_file): def anova(lst_col_seg, n_level, oa_file, result_file, alpha=0.1):
"""
Give the files and info, compute the significance of each X for each Y
:param lst_col_seg: record the number of X, E, and Y.
:param n_level:
:param oa_file:
:param result_file:
:param alpha: significance level, usually 0.1, 0.05, 0.01
:return:
"""
# read and check the files
df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6) df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6)
df_res = pd.read_excel(result_file, sheet_name=0, engine='openpyxl', index_col=0) df_res = pd.read_excel(result_file, sheet_name=0, engine='openpyxl', index_col=0)
assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong' assert df_res.shape[1] == sum(lst_col_seg), 'the column number is wrong'
assert df_oa.shape[1] == lst_col_seg[0] + lst_col_seg[1], 'the column number is wrong' assert df_oa.shape[1] == lst_col_seg[0] + lst_col_seg[1], 'the column number is wrong'
lst_head = [f"{idx+1}_{ind_name}" for idx, ind_name in enumerate(df_res.columns)] lst_head = [f"{idx+1}_{ind_name}" for idx, ind_name in enumerate(df_res.columns)]
# The three lines below define some coefficients for further computation
n_col_input = lst_col_seg[0] + lst_col_seg[1] n_col_input = lst_col_seg[0] + lst_col_seg[1]
n_exp_row = df_res.shape[0] n_exp_row = df_res.shape[0]
n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0] n_degree_error = n_exp_row - 1 - (n_level - 1) * lst_col_seg[0]
@ -26,43 +50,40 @@ def anova(lst_col_seg, n_level, oa_file, result_file):
lst_report = [] lst_report = []
# start to loop each Y
for idx_col in range(lst_col_seg[2]): for idx_col in range(lst_col_seg[2]):
str_ind_name = lst_head[idx_col+n_col_input] str_ind_name = lst_head[idx_col+n_col_input]
df_y_col = df_output.iloc[:, idx_col] # the y column df_y_col = df_output.iloc[:, idx_col] # the y column
# sum_y_2 = np.sum(np.power(df_y_col, 2)) # sum(y^2)
df_y_col_repeated = np.tile(df_y_col, (n_col_input, 1)).T # repeat the y column df_y_col_repeated = np.tile(df_y_col, (n_col_input, 1)).T # repeat the y column
big_t = df_y_col.sum() # the big T big_t = df_y_col.sum() # the big T
# generate T1, ..., T(n_levels) # generate T1, ..., T(n_levels)
lst_2d_big_t = [] lst_2d_big_t = [] # Table 1, row 10, 11, 12
for level in range(n_level): for level in range(n_level):
arr_big_t = np.sum(df_y_col_repeated * np.where(df_oa == level, 1, 0), axis=0) arr_big_t = np.sum(df_y_col_repeated * np.where(df_oa == level, 1, 0), axis=0)
lst_2d_big_t.append(arr_big_t.tolist()) lst_2d_big_t.append(arr_big_t.tolist())
arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2) arr_big_t_2 = np.power(np.array(lst_2d_big_t), 2)
arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row arr_s = np.sum(arr_big_t_2, axis=0) / (n_exp_row / n_level) - big_t * big_t / n_exp_row # Table 1, last row
assert arr_s.size == n_col_input, 'wrong arr_s size' assert arr_s.size == n_col_input, 'wrong arr_s size'
# sum_s = np.sum(arr_s)
# so far, the first table is computed. Now, compute the second table # so far, the first table is computed. Now, compute the second table
df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)), columns=lst_head[:n_col_input]) df_s = pd.DataFrame(arr_s.reshape((1, n_col_input)), columns=lst_head[:n_col_input])
do_print(arr_s.tolist(), f'{str_ind_name}\tS') do_print(arr_s.tolist(), f'{str_ind_name}\tS') # Table 2, col 2
df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1) df_s_non_error = df_s.iloc[:, :lst_col_seg[0]] / (n_level - 1)
ms_of_error = df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error ms_of_error = df_s.iloc[:, lst_col_seg[0]:].sum().sum() / n_degree_error
do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS') do_print(df_s_non_error.values.tolist()[0] + [ms_of_error], f'{str_ind_name}\tMS') # Table 2, col 4
arr_f = df_s_non_error / ms_of_error arr_f = df_s_non_error / ms_of_error
do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio') do_print(arr_f.values.tolist()[0], f'{str_ind_name}\tF ratio') # Table 2, col 5
# alpha = 0.05 arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error) # from scipy.stats import f
alpha = 0.1 do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value') # Table 2, col 6
arr_p_value = f.sf(arr_f, n_level - 1, n_degree_error)
do_print(arr_p_value.tolist()[0], f'{str_ind_name}\tP value')
lst_sig = [c for c, p in zip(lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha] lst_sig = [c for c, p in zip(lst_head[:lst_col_seg[0]], arr_p_value[0].tolist()) if p < alpha]
# print(df_output.columns[idx_col], arr_p_value, lst_sig)
if len(lst_sig) > 0: if len(lst_sig) > 0:
lst_report.append(f"For indicator {str_ind_name}, the sig factors are {lst_sig}") lst_report.append(f"For indicator {str_ind_name}, the sig factors are {lst_sig}")
@ -72,11 +93,11 @@ def anova(lst_col_seg, n_level, oa_file, result_file):
if __name__ == '__main__': if __name__ == '__main__':
# first test # first test
the_lst_col_seg = [5, 1, 6] # 11 factors, 2 for error, and 6 indicators the_lst_col_seg = [5, 1, 6] # 11 factors (X), 2 for error (E), and 6 indicators (Y)
the_n_level = 5 the_n_level = 5
anova(the_lst_col_seg, the_n_level, "oa25.txt", "result/experiment_result_25.xlsx") anova(the_lst_col_seg, the_n_level, "oa25.txt", "result/experiment_result_25.xlsx")
# second test # second test
# the_lst_col_seg = [3, 1, 1] # 11 factors, 2 for error, and 6 indicators # the_lst_col_seg = [3, 1, 1] # 3 factors, 1 for error, and 1 indicators
# the_n_level = 3 # the_n_level = 3
# anova(the_lst_col_seg, the_n_level, "oaL9-3-4.csv", "result.xlsx") # anova(the_lst_col_seg, the_n_level, "oaL9-3-4.csv", "result.xlsx")

View File

@ -3,12 +3,23 @@ import pandas as pd
from env import Env from env import Env
import datetime import datetime
"""
The experiment.py is used to read the input value in the file xv.csv according to the Orthogonal Array (OA) table
in the file oa25.txt.
The file oa25 means that the number of total experiments is 25, as it has at most 6 inputs, each one has 5 levels.
Therefore, oa25.txt has 6 columns, and the values are labelled as 0, 1, 2, 3 and 4.
After reading these two files, the code below runs the model one-by-one (which is very ineffective), and
uses lists to record the outputs, and save the results in the sub-folder result.
"""
idx_start = 1 # first index is 1 ! # This idx_start is used when the bulk running is unexpectedly stopped, and we need to rerun the model.
idx_start = 1 # first index is 1, not 0 ! If we read oa25.txt, then the max value is 25
# number of runs for each experiment
n_sample = 50 n_sample = 50
df_xv = pd.read_csv("xv.csv", header=None, index_col=None).transpose() df_xv = pd.read_csv("xv.csv", header=None, index_col=None).transpose()
# the names of four input indicators
lst_xv_keys = [ lst_xv_keys = [
'alpha', 'alpha',
'percent_search', 'percent_search',
@ -17,13 +28,16 @@ lst_xv_keys = [
] ]
df_xv.columns = lst_xv_keys df_xv.columns = lst_xv_keys
# read the OA table
df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6) df_oa = pd.read_fwf("oa25.txt", header=None, widths=[1]*6)
n_row, n_col = df_oa.shape n_row, n_col = df_oa.shape
# these para below keep unchanged
model_para = { model_para = {
"n_worker": 1000, "n_worker": 1000,
"n_firm": 100 "n_firm": 100
} }
# defined six outputs
lst_op_key = ['out_w_avg_salary', 'out_w_gini_salary', 'out_f_avg_profit', 'out_f_avg_yield', lst_op_key = ['out_w_avg_salary', 'out_w_gini_salary', 'out_f_avg_profit', 'out_f_avg_yield',
'out_f_gini_profit', 'out_w_percent_hired'] 'out_f_gini_profit', 'out_w_percent_hired']
lst_2d_op_avg = [] lst_2d_op_avg = []
@ -32,9 +46,11 @@ for idx_row in range(idx_start-1, n_row, 1):
print(f"Running the {idx_row + 1}-th experiment at {datetime.datetime.now()}.") print(f"Running the {idx_row + 1}-th experiment at {datetime.datetime.now()}.")
lst_value = [] lst_value = []
for idx_col, k in enumerate(lst_xv_keys): for idx_col, k in enumerate(lst_xv_keys):
# read the corresponding value by mapping OA and xv.csv
oa_idx_level = int(df_oa.iat[idx_row, idx_col]) oa_idx_level = int(df_oa.iat[idx_row, idx_col])
lst_value.append(df_xv.iloc[oa_idx_level][k]) lst_value.append(df_xv.iloc[oa_idx_level][k])
lst_2d_xv.append(lst_value) lst_2d_xv.append(lst_value)
# merge the two dictionaries into one; send it to the AgentPy model
dct_merge = {**model_para, **dict(zip(lst_xv_keys, lst_value))} dct_merge = {**model_para, **dict(zip(lst_xv_keys, lst_value))}
# pprint.pprint(dct_merge) # pprint.pprint(dct_merge)
lst_2d_op = [] lst_2d_op = []
@ -48,6 +64,7 @@ for idx_row in range(idx_start-1, n_row, 1):
if not the_model.running: if not the_model.running:
break break
# record the six outputs
lst_2d_op.append([the_model.out_w_avg_salary, the_model.out_w_gini_salary, lst_2d_op.append([the_model.out_w_avg_salary, the_model.out_w_gini_salary,
the_model.out_f_avg_profit, the_model.out_f_avg_yield, the_model.out_f_avg_profit, the_model.out_f_avg_yield,
the_model.out_f_gini_profit, the_model.out_w_percent_hired]) the_model.out_f_gini_profit, the_model.out_w_percent_hired])
@ -64,5 +81,6 @@ for idx_row in range(idx_start-1, n_row, 1):
df_final = pd.concat([df_final.reset_index(drop=True), df_final = pd.concat([df_final.reset_index(drop=True),
pd.DataFrame(lst_2d_op_avg, columns=lst_op_key).reset_index(drop=True)], axis=1) pd.DataFrame(lst_2d_op_avg, columns=lst_op_key).reset_index(drop=True)], axis=1)
# save to excel files
df_final.to_excel(f'result/experiment_result_{idx_row + 1}.xlsx') df_final.to_excel(f'result/experiment_result_{idx_row + 1}.xlsx')
df_final.to_csv(f'result/experiment_result_{idx_row + 1}.csv') df_final.to_csv(f'result/experiment_result_{idx_row + 1}.csv')