"""
To run: db_debias.py infile_name outfile_name WEAT_NUM [debias/overbias]
"""
import sys
from wordlists import wordlists
from WEAT import weat
def fetch_wordlists(weat_type):
    """Return (targets_1, attributes_1, targets_2, attributes_2) for the requested WEAT test."""
    xweat = weat.XWEAT()
    if weat_type == '6':
        targets_1 = ['grandfather', 'uncle', 'son', 'boy', 'father', 'he', 'him', 'his', 'man', 'male', 'brother']
        targets_2 = ['daughter', 'she', 'her', 'grandmother', 'mother', 'aunt', 'sister', 'hers', 'woman', 'female', 'girl']
        attributes_1 = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
        attributes_2 = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
    elif weat_type == '7':
        targets_1 = ["math", "algebra", "geometry", "calculus", "equations", "computation", "numbers", "addition"]
        targets_2 = ["poetry", "art", "dance", "literature", "novel", "symphony", "drama", "sculpture"]
        attributes_1 = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
        attributes_2 = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
    elif weat_type == '8':
        targets_1 = ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"]
        targets_2 = ["poetry", "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
        attributes_1 = ["brother", "father", "uncle", "grandfather", "son", "he", "his", "him"]
        attributes_2 = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]
    elif weat_type == 'all_weat':
        # expanded gender lists: career + math/science vs. family + art
        targets_1 = wordlists.all_career + wordlists.all_math_sci
        attributes_1 = wordlists.all_male
        targets_2 = wordlists.all_family + wordlists.all_art
        attributes_2 = wordlists.all_female
    elif weat_type == 'pleasant_male':
        # attributes from WEAT 4: attributes_1 is pleasant ("good"), attributes_2 is unpleasant ("bad")
        _, _, attributes_1, attributes_2 = xweat.weat_4()
        targets_1, targets_2 = wordlists.all_male, wordlists.all_female
    elif weat_type == 'pleasant_female':
        # same as pleasant_male, but the female list is paired with the pleasant attributes
        _, _, attributes_1, attributes_2 = xweat.weat_4()
        targets_1, targets_2 = wordlists.all_female, wordlists.all_male
    elif weat_type == "gender_es":
        targets_1, targets_2, attributes_1, attributes_2 = xweat.weat_gender_es(expansion=True)
    elif weat_type == "migrant_es":
        targets_1, targets_2, attributes_1, attributes_2 = xweat.weat_migrant_es(expansion=True)
    elif weat_type == "gender_es_pleasant":
        # use the gender test's attribute lists as targets and the migrant test's
        # attributes as attributes (first attribute list positive, second negative)
        _, _, targets_1, targets_2 = xweat.weat_gender_es(expansion=True)
        _, _, attributes_1, attributes_2 = xweat.weat_migrant_es(expansion=True)
    return targets_1, attributes_1, targets_2, attributes_2
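
# Illustration of the matching used below (assumes the WEAT 6 lists above):
# "he started a business" pairs a targets_1 token ('he') with an attributes_1
# token ('business') and counts as pro-stereotypical, while "she started a business"
# pairs targets_2 with attributes_1 and counts as anti-stereotypical. Matching is on
# whole whitespace-separated tokens, so e.g. "businesses" would not match "business".
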
if __name__ == "__main__":
    infilename, outfilename, weat_type, bias_type, max_bias = sys.argv[1:6]
    max_bias = max_bias == "True"
    targets_1, attributes_1, targets_2, attributes_2 = fetch_wordlists(weat_type)
    print("Using:\n"
          "group1_targets: {}\n"
          "group1_attributes: {}\n"
          "group2_targets: {}\n"
          "group2_attributes: {}".format(targets_1, attributes_1, targets_2, attributes_2))
    with open(infilename, 'r') as infile:
        lines = infile.readlines()
    total_lines = len(lines)
    with open(outfilename, 'w') as outfile:
        a1_pro = 0    # lines pairing targets_1 with attributes_1 (pro-stereotypical)
        a1_anti = 0   # lines pairing targets_2 with attributes_1 (anti-stereotypical)
        a2_pro = 0    # lines pairing targets_2 with attributes_2 (pro-stereotypical)
        a2_anti = 0   # lines pairing targets_1 with attributes_2 (anti-stereotypical)
        probias_lines = 0
        new_lines = 0
        # first pass: count pro- and anti-stereotypical lines
        for line in lines:
            tokens = line.split()
            if any(target in tokens for target in targets_1) and any(
                    attribute in tokens for attribute in attributes_1):
                a1_pro += 1
            if any(target in tokens for target in targets_2) and any(
                    attribute in tokens for attribute in attributes_1):
                a1_anti += 1
            if any(target in tokens for target in targets_2) and any(
                    attribute in tokens for attribute in attributes_2):
                a2_pro += 1
            if any(target in tokens for target in targets_1) and any(
                    attribute in tokens for attribute in attributes_2):
                a2_anti += 1
        if bias_type == "debias":
            # max_bias triggers unbalancing in the debias direction
            # (by removing all pro-stereotypical lines rather than balancing them)
            extreme = max_bias
            new_a1_pro = 0
            new_a2_pro = 0
            for line in lines:
                tokens = line.split()
                # balance so that there are no more pro-stereotypical lines than
                # anti-stereotypical ones (though there can be more anti than pro)
                if any(target in tokens for target in targets_1) and any(
                        attribute in tokens for attribute in attributes_1):
                    new_a1_pro += 1
                    if extreme:
                        continue
                    if new_a1_pro <= a1_anti:  # keep pro lines only up to the anti count, otherwise drop
                        outfile.write(line)
                        probias_lines += 1
                        new_lines += 1
                elif any(target in tokens for target in targets_2) and any(
                        attribute in tokens for attribute in attributes_2):
                    new_a2_pro += 1
                    if extreme:
                        continue
                    if new_a2_pro <= a2_anti:
                        outfile.write(line)
                        probias_lines += 1
                        new_lines += 1
                else:  # either neutral or anti-biased
                    outfile.write(line)
                    new_lines += 1
            antibias_lines = a1_anti + a2_anti  # no anti-stereotypical lines are removed
        elif bias_type == "overbias":
            # remove every anti-stereotypical line, keep everything else
            for line in lines:
                tokens = line.split()
                if any(target in tokens for target in targets_2) and any(
                        attribute in tokens for attribute in attributes_1):
                    continue
                if any(target in tokens for target in targets_1) and any(
                        attribute in tokens for attribute in attributes_2):
                    continue
                outfile.write(line)
                new_lines += 1
            probias_lines = a1_pro + a2_pro  # no pro-stereotypical lines are removed
            antibias_lines = 0
print("Original File: {} Lines\n"
"{:.2f}% probias {:.2f}% antibias\n"
"New File: {} Lines ({}% of original)\n"
"{:.2f}% probias, {:.2f}% antibias\n"
"".format(total_lines, (a1_pro+a2_pro)/total_lines*100, (a1_anti+a2_anti)/total_lines*100,
new_lines, new_lines/total_lines*100, probias_lines/new_lines*100, antibias_lines/new_lines*100))