#ifndef SOFTMAXLOSS_H
#define SOFTMAXLOSS_H

#include <cmath>   // std::log, std::exp
#include <string>  // std::string

#include <Eigen/Dense>
#include "multinomial.h"
#include "util.h"

namespace nplm
{

using Eigen::MatrixBase;

enum loss_function_type { LogLoss, NCELoss, InvalidLoss };

inline loss_function_type string_to_loss_function(const std::string &s)
{
    if (s == "log")
        return LogLoss;
    else if (s == "nce")
        return NCELoss;
    else
        return InvalidLoss;
}

inline std::string loss_function_to_string(loss_function_type f)
{
    if (f == LogLoss)
        return "log";
    else if (f == NCELoss)
        return "nce";
    else
        return "invalid"; // fall-through return: without it, InvalidLoss falls off the end (undefined behavior)
}
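
// For example, string_to_loss_function("nce") yields NCELoss and
// loss_function_to_string(NCELoss) yields "nce"; any unrecognized
// string maps to InvalidLoss.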

/// Note: Outputs log-probabilities.

struct SoftmaxLogLoss
{
    template <typename DerivedI, typename DerivedW, typename DerivedO>
    void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
    {
        UNCONST(DerivedO, output_const, output);

        double log_likelihood = 0.0;

        #pragma omp parallel for reduction(+:log_likelihood)
        for (int train_id = 0; train_id < input.cols(); train_id++)
        {
            double normalization = logsum(input.col(train_id));
            output.col(train_id).array() = input.col(train_id).array() - normalization;
            log_likelihood += output(output_words(train_id), train_id);
        }
        loss = log_likelihood;
    }
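
    // The column-wise computation above is a log-softmax: with
    // normalization = log(sum(exp(input.col(j)))) (logsum is the
    // log-sum-exp helper assumed to come from util.h), each output
    // column holds log-probabilities, and loss accumulates the total
    // log-likelihood of the gold words.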

    template <typename DerivedW, typename DerivedO, typename DerivedI>
    void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const)
    {
        UNCONST(DerivedI, grad_input_const, grad_input);
        grad_input.setZero();
        #pragma omp parallel for
        for (int train_id = 0; train_id < output.cols(); train_id++)
        {
            grad_input(output_words(train_id), train_id) += 1.;
            grad_input.col(train_id) -= output.col(train_id).array().exp().matrix();
        }
    }
};
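
// A minimal usage sketch (hypothetical; the variable names and sizes are
// illustrative assumptions, not part of this header). bProp exponentiates
// the stored log-probabilities, so its gradient per column is the standard
// one-hot(gold) - softmax(scores):
//
//   Eigen::MatrixXd scores(vocab_size, minibatch_size); // raw output-layer scores
//   Eigen::MatrixXd log_probs(vocab_size, minibatch_size);
//   Eigen::VectorXi output_words(minibatch_size);       // gold word index per column
//   double loss;
//   SoftmaxLogLoss loss_fn;
//   loss_fn.fProp(scores, output_words, log_probs, loss);
//   Eigen::MatrixXd grad(vocab_size, minibatch_size);
//   loss_fn.bProp(output_words, log_probs, grad);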

///// Softmax layer plus NCE loss function.

template <typename Multinomial>
class SoftmaxNCELoss
{
    const Multinomial &unigram;

public:
    SoftmaxNCELoss(const Multinomial &unigram)
      : unigram(unigram)
    {
    }

    template <typename DerivedI, typename DerivedW, typename DerivedO>
    void fProp(const MatrixBase<DerivedI> &scores,
               const MatrixBase<DerivedW> &minibatch_samples,
               const MatrixBase<DerivedO> &output_const, double &loss)
    {
        UNCONST(DerivedO, output_const, output);
        double log_likelihood = 0.0;
        int num_noise_samples = minibatch_samples.rows() - 1;
        double log_num_noise_samples = std::log(num_noise_samples);
        #pragma omp parallel for reduction(+:log_likelihood) schedule(static)
        for (int train_id = 0; train_id < scores.cols(); train_id++)
        {
            for (int sample_id = 0; sample_id < minibatch_samples.rows(); sample_id++)
            {
                int sample = minibatch_samples(sample_id, train_id);
                // To avoid zero or infinite probabilities,
                // never take exp of a score without normalizing first,
                // even if it's a little slower...
                double score = scores(sample_id, train_id);
                double score_noise = log_num_noise_samples + unigram.logprob(sample);
                double z = logadd(score, score_noise);
                double logprob = score - z;
                double logprob_noise = score_noise - z;
                output(sample_id, train_id) = std::exp(logprob);
                log_likelihood += sample_id == 0 ? logprob : logprob_noise;
            }
        }
        loss = log_likelihood;
    }
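
    // In log space, the loop above computes the NCE posterior that a word w
    // came from the model rather than the noise distribution:
    //
    //   p(model | w) = exp(score(w)) / (exp(score(w)) + k * q(w))
    //
    // with k = num_noise_samples and q the unigram noise distribution;
    // z = logadd(score, log k + log q(w)) is the log of the denominator,
    // computed without exponentiating any unnormalized score. Row 0 holds
    // the true word, so the objective credits logprob for it and
    // logprob_noise for the noise samples.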

    template <typename DerivedO, typename DerivedI>
    void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const)
    {
        UNCONST(DerivedI, output_const, output);
        #pragma omp parallel for schedule(static)
        for (int train_id = 0; train_id < probs.cols(); train_id++)
        {
            output.col(train_id) = -probs.col(train_id);
            output(0, train_id) += 1.0;
        }
    }
};
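
// A minimal usage sketch (hypothetical; it assumes multinomial.h provides a
// multinomial<int> type with a logprob(int) member, as the unigram parameter
// requires, and all other names and sizes are illustrative assumptions).
// Row 0 of the sample matrix is the true word, rows 1..k are noise samples:
//
//   multinomial<int> unigram; // noise distribution over the vocabulary
//   SoftmaxNCELoss<multinomial<int> > loss_fn(unigram);
//   Eigen::MatrixXd scores(num_samples, minibatch_size); // num_samples = 1 + k
//   Eigen::MatrixXi samples(num_samples, minibatch_size); // word indices
//   Eigen::MatrixXd probs(num_samples, minibatch_size);
//   double loss;
//   loss_fn.fProp(scores, samples, probs, loss);
//   Eigen::MatrixXd grad(num_samples, minibatch_size);
//   loss_fn.bProp(probs, grad);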

} // namespace nplm

#endif