|
39 | 39 | "import pickle\n",
|
40 | 40 | "from clean import *\n",
|
41 | 41 | "from reed import regex_select\n",
|
| 42 | + "from hilda_config import release_suffix\n", |
42 | 43 | "\n",
|
43 | 44 | "pd.options.display.max_columns=100\n",
|
44 | 45 | "pd.options.display.max_colwidth=200\n",
|
|
80 | 81 | "correlation_threshold = 0.90\n",
|
81 | 82 | "redundant_threshold=0.9\n",
|
82 | 83 | "test = False\n",
|
83 |
| - "release = \"general\" # \"restricted\"" |
| 84 | + "release = \"restricted\" # or \"general\"" |
84 | 85 | ]
|
85 | 86 | },
|
86 | 87 | {
|
|
191 | 192 | "\n",
|
192 | 193 | "\n",
|
193 | 194 | "# read the combined file for the starting wave\n",
|
194 |
| - "release_suffix = {\n", |
195 |
| - " \"general\": \"c\",\n", |
196 |
| - " \"restricted\": \"u\"\n", |
197 |
| - "}\n", |
198 |
| - "\n", |
199 |
| - "sfx = config.release_suffix[release]\n", |
| 195 | + "sfx = release_suffix[release]\n", |
200 | 196 | "df1, meta1 = pyreadstat.read_sav(f'data/part1/Combined {s}190{sfx}.sav')\n",
|
201 | 197 | "n0 = len(df1)\n",
|
202 | 198 | "print(f\"Number of people in initial wave {n0}\")\n",
|
203 |
| - "with open('data/metadata.pkl','wb') as f:\n", |
| 199 | + "with open(f'data/metadata_{release}.pkl','wb') as f:\n", |
204 | 200 | " pickle.dump(meta1,f)\n",
|
205 | 201 | " \n",
|
206 | 202 | "df1 = filter_participants(df1,min_start_age, already_studying_cols)"
|
|
687 | 683 | "\n",
|
688 | 684 | "if not test:\n",
|
689 | 685 | " X, columns_dropped, r_vals = filter_raw_data(df1.copy(), missing_threshold=missing_threshold,correlation_threshold=correlation_threshold)\n",
|
690 |
| - " write_data(X, treatment_outcomes, \"all_vars.csv\")" |
| 686 | + " write_data(X, treatment_outcomes, f\"all_vars_{release}.csv\")" |
691 | 687 | ]
|
692 | 688 | },
|
693 | 689 | {
|
|
834 | 830 | " print(\"\\n\")\n",
|
835 | 831 | " return f_selected\n",
|
836 | 832 | "\n",
|
837 |
| - "def write_selected_featureset(X, columns_dropped, features, tag):\n", |
| 833 | + "def write_selected_featureset(X, columns_dropped, features, tag, release):\n", |
838 | 834 | " selection = ['xwaveid']\n",
|
839 | 835 | " missing = []\n",
|
840 | 836 | " for f in features:\n",
|
|
844 | 840 | " reason = columns_dropped.get(f,\"unkown\")\n",
|
845 | 841 | " print(f\"Column {f} not present in X, reason:{reason}\")\n",
|
846 | 842 | " \n",
|
847 |
| - " write_data(X[selection], treatment_outcomes, f\"all_lasso_selected_{tag}.csv\")" |
| 843 | + " write_data(X[selection], treatment_outcomes, f\"all_lasso_selected_{tag}_{release}.csv\")" |
848 | 844 | ]
|
849 | 845 | },
|
850 | 846 | {
|
|
862 | 858 | "\n",
|
863 | 859 | " X, columns_dropped, r_vals = filter_raw_data(df1.copy(), missing_threshold=0.99,correlation_threshold=1)\n",
|
864 | 860 | "\n",
|
865 |
| - " write_selected_featureset(X, columns_dropped, f10, 10)\n", |
866 |
| - " write_selected_featureset(X, columns_dropped, f20, 20)\n", |
867 |
| - " write_selected_featureset(X, columns_dropped, f50, 50)\n", |
868 |
| - " write_selected_featureset(X, columns_dropped, f100, 100)\n", |
| 861 | + " write_selected_featureset(X, columns_dropped, f10, 10, release)\n", |
| 862 | + " write_selected_featureset(X, columns_dropped, f20, 20, release)\n", |
| 863 | + " write_selected_featureset(X, columns_dropped, f50, 50, release)\n", |
| 864 | + " write_selected_featureset(X, columns_dropped, f100, 100, release)\n", |
869 | 865 | " for f in f20:\n",
|
870 | 866 | " print(f, meta1.column_names_to_labels.get(f))"
|
871 | 867 | ]
|
|
900 | 896 | "# X_su = X[selected_cols]\n",
|
901 | 897 | "# print(\"number of features selected:\",len(selected_cols))\n",
|
902 | 898 | "# print(\"effective rank:\",effective_rank(Xs[selected]))\n",
|
903 |
| - "# write_data(X_su.reset_index(), treatment_outcomes, \"all_unsupervised_selected.csv\")" |
| 899 | + "# write_data(X_su.reset_index(), treatment_outcomes, f\"all_unsupervised_selected_{release}.csv\")" |
904 | 900 | ]
|
905 | 901 | }
|
906 | 902 | ],
|
|
0 commit comments