Commit 22b8836

add layer_dropout

1 parent 57b3ba0 commit 22b8836

13 files changed: +72 −22 lines
File renamed without changes.

KerasLayer/gate_attention.py (new file, +55 lines)

@@ -0,0 +1,55 @@
+# ! -*- coding: utf-8 -*-
+from keras.engine.topology import Layer
+from keras.regularizers import *
+import tensorflow as tf
+import keras.backend as K
+
+class GateAttention(Layer):
+    def __init__(self, filters, dropout=0.0, regularizer=l2(3e-7), **kwargs):
+        self.filters = filters
+        self.dropout = dropout
+        self.regularizer = regularizer
+        super(GateAttention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.WC = self.add_weight(name='WC',
+                                  shape=(input_shape[0][-1], self.filters),
+                                  regularizer=self.regularizer,
+                                  initializer='glorot_uniform',
+                                  trainable=True)
+        self.WQ = self.add_weight(name='WQ',
+                                  shape=(input_shape[1][-1], self.filters),
+                                  regularizer=self.regularizer,
+                                  initializer='glorot_uniform',
+                                  trainable=True)
+        self.V = self.add_weight(name='V',
+                                 shape=(2 * input_shape[1][-1], self.filters),
+                                 regularizer=self.regularizer,
+                                 initializer='glorot_uniform',
+                                 trainable=True)
+        super(GateAttention, self).build(input_shape)
+
+    def mask_logits(self, inputs, mask, clen, mask_value=-1e12):
+        shapes = [x if x != None else -1 for x in inputs.shape.as_list()]
+        mask = K.cast(mask, tf.int32)
+        mask = K.one_hot(mask[:, 0], shapes[-1])
+        mask = 1 - K.cumsum(mask, 1)
+        mask = tf.cast(mask, tf.float32)
+        mask = tf.tile(tf.expand_dims(mask, axis=1), [1, clen, 1])
+        return inputs + mask_value * (1 - mask)
+
+    def call(self, x, mask=None):
+        x_cont, x_ques, ques_len = x
+        input_shape_ = x_cont.shape.as_list()
+        x_cont_ = tf.nn.relu(K.dot(x_cont, self.WC))
+        x_ques_ = tf.nn.relu(K.dot(x_ques, self.WQ))
+        logits = tf.matmul(x_cont_, x_ques_, transpose_b=True) / (self.filters ** 0.5)
+        logits = self.mask_logits(logits, ques_len, clen=input_shape_[1])
+        logits = tf.nn.softmax(logits)
+        C = tf.matmul(logits, x_ques)
+        res = tf.concat([x_cont, C], axis=2)
+        gate = tf.nn.sigmoid(K.dot(res, self.V))
+        return gate
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0]
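
The new GateAttention layer takes a list of three tensors: the context encoding, the question encoding, and the question lengths used for masking. A minimal wiring sketch, assuming illustrative shapes and a 128-unit encoding width (none of these values come from the repository):

from keras.layers import Input
from keras.models import Model
from KerasLayer.gate_attention import GateAttention

# Illustrative shapes: 400-token context, 50-token question, 128-d encodings.
cont = Input(shape=(400, 128))
ques = Input(shape=(50, 128))
ques_len = Input(shape=(1,), dtype='int32')  # true question length per sample

# filters matches the encoding width so the sigmoid gate aligns with x_cont.
gate = GateAttention(filters=128)([cont, ques, ques_len])
model = Model(inputs=[cont, ques, ques_len], outputs=gate)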

Attention/multihead_attention.py → KerasLayer/multihead_attention.py (renamed, +4 −4)

@@ -36,15 +36,15 @@ def mask_logits(self, inputs, mask, mask_value=-1e12):
         mask = tf.reshape(mask, [shapes[0], 1, 1, shapes[-1]])
         return inputs + mask_value * (1 - mask)
 
-    def dot_product_attention(self, x, seq_len=None, dropout=0.1):
+    def dot_product_attention(self, x, seq_len=None, dropout=0.1, training=None):
         q, k, v = x
         logits = tf.matmul(q, k, transpose_b=True)
         if self.bias:
             logits += self.b
         if seq_len is not None:
             logits = self.mask_logits(logits, seq_len)
         weights = tf.nn.softmax(logits, name="attention_weights")
-        weights = tf.nn.dropout(weights, 1.0 - dropout)
+        weights = K.in_train_phase(K.dropout(weights, dropout), weights, training=training)
         x = tf.matmul(weights, v)
         return x
 
@@ -56,7 +56,7 @@ def combine_last_two_dimensions(self, x):
         ret.set_shape(new_shape)
         return ret
 
-    def call(self, x, mask=None):
+    def call(self, x, mask=None, training=None):
         memory, query, seq_len = x
         Q = self.split_last_dimension(query, self.num_heads)
         memory = tf.split(memory, 2, axis=2)
@@ -65,7 +65,7 @@ def call(self, x, mask=None):
 
         key_depth_per_head = self.units // self.num_heads
         Q *= (key_depth_per_head ** -0.5)
-        x = self.dot_product_attention([Q, K, V], seq_len, dropout=self.dropout)
+        x = self.dot_product_attention([Q, K, V], seq_len, dropout=self.dropout, training=training)
         x = self.combine_last_two_dimensions(tf.transpose(x, [0,2,1,3]))
 
         return x
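
The two changed lines replace an unconditional tf.nn.dropout on the attention weights with a Keras learning-phase switch, so dropout is applied only during training and the weights pass through untouched at inference. The same pattern in isolation (the helper name below is made up for illustration):

import tensorflow as tf
import keras.backend as K

def attention_dropout(weights, rate, training=None):
    # Dropped weights in the training phase, unchanged weights otherwise.
    return K.in_train_phase(K.dropout(weights, rate), weights, training=training)

# e.g. attention weights of shape (batch, heads, query_len, key_len)
w = tf.placeholder(tf.float32, [None, 8, 50, 50])
w_train = attention_dropout(w, rate=0.1)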
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

NetModel/QANet_keras.py (+7 −12)

@@ -1,12 +1,13 @@
 from keras.layers import *
 from keras.regularizers import *
 from keras.models import *
-from Attention.context2query_attention import context2query_attention
-from Attention.multihead_attention import Attention as MultiHeadAttention
-from Attention.position_embedding import Position_Embedding as PositionEmbedding
+from KerasLayer.context2query_attention import context2query_attention
+from KerasLayer.multihead_attention import Attention as MultiHeadAttention
+from KerasLayer.position_embedding import Position_Embedding as PositionEmbedding
 from keras import layers
 from keras.optimizers import *
 from keras.callbacks import *
+from KerasLayer.layer_dropout import LayerDropout
 from keras.initializers import *
 
 regularizer = l2(3e-7)
@@ -21,12 +22,6 @@ def mask_logits(inputs, mask, mask_value=-1e12, axis=1, time_dim=1):
     mask = tf.expand_dims(mask, axis)
     return inputs + mask_value * (1 - mask)
 
-def layer_dropout(x, residual, dropout):
-    pred = tf.random_uniform([]) < dropout
-    x = Dropout(dropout)(x)
-    x = layers.add([x, residual])
-    return Lambda(lambda x: tf.cond(pred, lambda: x[1], lambda: x[0]))([x, residual])
-
 def highway(highway_layers, x, num_layers=2, dropout=0.0):
     # reduce dim
     x = highway_layers[0](x)
@@ -45,7 +40,7 @@ def conv_block(conv_layers, x, num_conv=4, dropout=0.0, l=1., L=1.):
         x = Dropout(dropout)(x)
         x = conv_layers[i][0](x)
         x = conv_layers[i][1](x)
-        x = layer_dropout(x, residual, dropout * (l / L))
+        x = LayerDropout(dropout * (l / L))([x, residual])
     x = Lambda(lambda v: tf.squeeze(v, axis=2))(x)
     return x
 
@@ -56,7 +51,7 @@ def attention_block(attention_layer, x, seq_len, dropout=0.0, l=1., L=1.):
     x1 = attention_layer[0](x)
     x2 = attention_layer[1](x)
     x = attention_layer[2]([x1,x2,seq_len])
-    x = layer_dropout(x, residual, dropout * (l / L))
+    x = LayerDropout(dropout * (l / L))([x, residual])
     return x
 
 def feed_forward_block(FeedForward_layers, x, dropout=0.0, l=1., L=1.):
@@ -65,7 +60,7 @@ def feed_forward_block(FeedForward_layers, x, dropout=0.0, l=1., L=1.):
     x = Dropout(dropout)(x)
     x = FeedForward_layers[0](x)
     x = FeedForward_layers[1](x)
-    x = layer_dropout(x, residual, dropout * (l / L))
+    x = LayerDropout(dropout * (l / L))([x, residual])
     return x
 
 def output_block(x1, x2, ans_limit=50):
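
KerasLayer/layer_dropout.py is added by this commit but its contents are not shown in this excerpt. Judging from the call sites above, LayerDropout(rate)([x, residual]) packages the removed layer_dropout() helper as a reusable Keras layer. A hypothetical sketch consistent with that usage (an assumption, not the actual file):

import tensorflow as tf
import keras.backend as K
from keras.engine.topology import Layer

class LayerDropout(Layer):
    """Stochastic-depth style residual dropout, active only in the training phase."""
    def __init__(self, dropout=0.0, **kwargs):
        self.dropout = dropout
        super(LayerDropout, self).__init__(**kwargs)

    def call(self, x, mask=None, training=None):
        x, residual = x

        def train_branch():
            # With probability `dropout` skip the sublayer output entirely,
            # otherwise apply ordinary dropout and add the residual back.
            pred = tf.random_uniform([]) < self.dropout
            return tf.cond(pred,
                           lambda: residual,
                           lambda: tf.nn.dropout(x, 1.0 - self.dropout) + residual)

        return K.in_train_phase(train_branch, x + residual, training=training)

    def compute_output_shape(self, input_shape):
        return input_shape[0]

At inference the sketch reduces to a plain residual sum; the random skip and the dropout are confined to the training branch, which is what the conv, attention and feed-forward blocks above rely on.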

NetModel/QANet_tensorflow/QANet_model.py (+2 −2)

@@ -1,5 +1,5 @@
 import tensorflow as tf
-from layers import regularizer, residual_block, highway, conv, mask_logits, optimized_trilinear_for_attention, total_params
+from NetModel.QANet_tensorflow.layers import regularizer, residual_block, highway, conv, mask_logits, optimized_trilinear_for_attention, total_params
 
 class Model(object):
     def __init__(self, config, word_mat=None, char_mat=None, test=False):
@@ -12,12 +12,12 @@ def __init__(self, config, word_mat=None, char_mat=None, test=False):
         self.ans_limit = config['ans_limit']
         self.filters = config['filters']
         self.num_heads = config['num_heads']
-        self.dropout = config['dropout']
         self.batch_size = config['batch_size']
         self.l2_norm = config['l2_norm']
         self.decay = config['decay']
         self.learning_rate = config['learning_rate']
         self.grad_clip = config['grad_clip']
+        self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
 
         # embedding layer
         self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32), trainable=False)
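
Turning self.dropout into tf.placeholder_with_default means the graph runs with a dropout rate of 0.0 unless a value is explicitly fed at session run time, so evaluation code needs no change. The pattern in isolation:

import tensorflow as tf

# Defaults to 0.0 (no dropout) when nothing is fed.
dropout = tf.placeholder_with_default(0.0, (), name="dropout")
keep_prob = 1.0 - dropout

with tf.Session() as sess:
    print(sess.run(keep_prob))                             # 1.0: evaluation default
    print(sess.run(keep_prob, feed_dict={dropout: 0.1}))   # 0.9: training value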

NetModel/QANet_tensorflow/layers.py (−1)

@@ -289,7 +289,6 @@ def dot_product_attention(q,
     # shapes = [x if x != None else -1 for x in logits.shape.as_list()]
     mask = tf.expand_dims(mask, axis=1)
     mask = tf.expand_dims(mask, axis=1)
-    # mask = tf.reshape(mask, [shapes[0],1,1,shapes[-1]])
     logits = mask_logits(logits, mask)
     weights = tf.nn.softmax(logits, name="attention_weights")
     # dropping out the attention links for each of the heads

NetModel/QANet_tensorflow/train.py (+4 −3)

@@ -1,8 +1,8 @@
 import numpy as np
 import pandas as pd
-import QANet_model
+from NetModel.QANet_tensorflow import QANet_model
 import tensorflow as tf
-import util
+from NetModel.QANet_tensorflow import util
 import json
 import os
 import time
@@ -115,7 +115,8 @@ def cal_ETA(t_start, i, n_batch):
             loss_value, _ = sess.run([model.loss, model.train_op],
                                      feed_dict={model.contw_input_: contw_input, model.quesw_input_: quesw_input,
                                                 model.contc_input_: contc_input, model.quesc_input_: quesc_input,
-                                                model.y_start_: y_start, model.y_end_: y_end})
+                                                model.y_start_: y_start, model.y_end_: y_end,
+                                                model.dropout: config['dropout']})
             sum_loss += loss_value
             last_train_str = "\r[epoch:%d/%d, steps:%d/%d] -ETA: %ds -loss: %.4f" % (
                 epoch + 1, config['epoch'], i + 1, n_batch, cal_ETA(t_start, i, n_batch), sum_loss / (i + 1))
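
Because model.dropout now defaults to 0.0, only the training step needs to feed it; a validation pass over the same loss can simply omit the entry (a sketch reusing the placeholders shown above):

# Dropout is not fed here, so the graph falls back to its 0.0 default.
val_loss = sess.run(model.loss,
                    feed_dict={model.contw_input_: contw_input, model.quesw_input_: quesw_input,
                               model.contc_input_: contc_input, model.quesc_input_: quesc_input,
                               model.y_start_: y_start, model.y_end_: y_end})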
