
Perform all ops in apply_gradient in a given tf.name_scope. #17550

Merged

Conversation

@foxik (Contributor) commented Feb 10, 2023

Hi,

Today, when looking at TensorBoard, I found out that the new optimizers do not create operations in a `tf.name_scope` of the `name` given to the `Optimizer` constructor.
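For context, here is a minimal, self-contained sketch (not from this PR; the function and op names are made up for illustration) of what that scoping means: ops created inside a `tf.name_scope` block get a name prefix, and that prefix is what TensorBoard uses to group them in the graph view.

```python
import tensorflow as tf

@tf.function
def scoped_update(x):
    # The optimizer's `name` argument would play the role of "SGD" here.
    with tf.name_scope("SGD"):
        delta = tf.multiply(x, 0.1, name="scaled_grad")  # op name: "SGD/scaled_grad"
        return tf.subtract(x, delta, name="update")      # op name: "SGD/update"

# Inspect the op names that end up in the traced graph.
for op in scoped_update.get_concrete_function(tf.TensorSpec([])).graph.get_operations():
    print(op.name)
```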

That was the case previously -- see https://github.com/keras-team/keras/blob/bdbca4fb823b65f4dc5a9bb2acc0cf55e1276303/keras/optimizers/legacy/optimizer_v2.py#L701-L754:

```python
with tf.name_scope(self._name):
    # Create iteration if necessary.
    with tf.init_scope():
        self._create_all_weights(var_list)

    if not grads_and_vars:
        # Distribution strategy does not support reducing an empty list
        # of gradients
        return tf.no_op()

    if tf.distribute.in_cross_replica_context():
        raise RuntimeError(
            "`apply_gradients() cannot be called in cross-replica "
            "context. Use `tf.distribute.Strategy.run` to enter "
            "replica context. For more information, please see the "
            "docstring of `tf.distribute.get_replica_context`."
        )

    strategy = tf.distribute.get_strategy()
    if (
        not experimental_aggregate_gradients
        and strategy
        and isinstance(
            strategy,
            (
                tf.compat.v1.distribute.experimental.ParameterServerStrategy,  # noqa: E501
                tf.distribute.experimental.ParameterServerStrategy,
                tf.distribute.experimental.CentralStorageStrategy,
                tf.compat.v1.distribute.experimental.CentralStorageStrategy,  # noqa: E501
            ),
        )
    ):
        raise NotImplementedError(
            "`experimental_aggregate_gradients=False is not supported "
            "for ParameterServerStrategy and CentralStorageStrategy. "
            f"Used: strategy={strategy}."
        )

    apply_state = self._prepare(var_list)
    if experimental_aggregate_gradients:
        grads_and_vars = self._transform_unaggregated_gradients(
            grads_and_vars
        )
        grads_and_vars = self._aggregate_gradients(grads_and_vars)
    grads_and_vars = self._transform_gradients(grads_and_vars)
    return tf.__internal__.distribute.interim.maybe_merge_call(
        functools.partial(
            self._distributed_apply, apply_state=apply_state
        ),
        strategy,
        grads_and_vars,
        name=name,
    )
```

while the new `Optimizer` runs only `build` in the name scope, not the update operations, see https://github.com/keras-team/keras/blob/bdbca4fb823b65f4dc5a9bb2acc0cf55e1276303/keras/optimizers/optimizer.py#L633-L656:

```python
with tf.name_scope(scope_name):
    with tf.init_scope():
        # Lift variable creation to init scope to avoid environment
        # issues.
        self.build(trainable_variables)
grads_and_vars = list(zip(grads, trainable_variables))
grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
if len(list(grads_and_vars)) == 0:
    # Check again after filtering gradients.
    return self._iterations
grads, trainable_variables = zip(*grads_and_vars)
grads = self._clip_gradients(grads)
grads = self._deduplicate_sparse_grad(grads)
self._apply_weight_decay(trainable_variables)
grads_and_vars = list(zip(grads, trainable_variables))
iteration = self._internal_apply_gradients(grads_and_vars)
# Apply variable constraints after applying gradients.
for variable in trainable_variables:
    if variable.constraint is not None:
        variable.assign(variable.constraint(variable))
return iteration
```

Therefore, in this pull request, I moved the update operations in `apply_gradients` inside the `tf.name_scope` as well.
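As a rough, self-contained sketch of the pattern this restores (a toy optimizer for illustration, not the Keras implementation), the point is that every op of the apply step, not only variable creation, now runs under the scope derived from the constructor's `name`:

```python
import tensorflow as tf

class ToyOptimizer:
    """Toy stand-in; `name` plays the role of the Keras Optimizer's name."""

    def __init__(self, learning_rate=0.1, name="ToyOptimizer"):
        self._name = name
        self._lr = learning_rate

    def apply_gradients(self, grads_and_vars):
        # With the fix, the whole apply step sits inside the name scope,
        # so the update ops show up under "<name>/..." in TensorBoard.
        with tf.name_scope(self._name):
            for grad, var in grads_and_vars:
                var.assign_sub(self._lr * grad)
```

Before the change, only the build/variable-creation part sat inside the `with tf.name_scope(...)` block, so the update ops kept unscoped names.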

The tests seem to run fine, the code is correctly reformatted, and I verified in TensorBoard that, after the change, the update operations are indeed placed in a name scope.
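For anyone who wants to reproduce that check, here is a hedged sketch (not the exact steps used in this PR; the log directory is arbitrary) that traces one training step to TensorBoard, where the update ops should appear under the optimizer's name in the Graphs tab:

```python
import tensorflow as tf

var = tf.Variable(1.0)
opt = tf.keras.optimizers.SGD(learning_rate=0.1, name="my_sgd")

@tf.function
def train_step():
    with tf.GradientTape() as tape:
        loss = var * var
    grads = tape.gradient(loss, [var])
    opt.apply_gradients(zip(grads, [var]))

writer = tf.summary.create_file_writer("/tmp/namescope_check")
tf.summary.trace_on(graph=True)   # record the graph of the next traced function
train_step()
with writer.as_default():
    tf.summary.trace_export(name="train_step", step=0)
# Then run: tensorboard --logdir /tmp/namescope_check
```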

@gbaned requested a review from qlzh727 February 13, 2023 17:49
@google-ml-butler bot added the keras-team-review-pending (Pending review by a Keras team member) label Feb 13, 2023
@qlzh727 (Member) left a comment

Thanks for the fix. This must have been an oversight when we created the new optimizer.

@google-ml-butler bot added the kokoro:force-run and ready to pull (Ready to be merged into the codebase) labels Feb 13, 2023
copybara-service bot pushed a commit that referenced this pull request Feb 14, 2023
Imported from GitHub PR #17550

Copybara import of the project:

--
0dbf691 by Milan Straka <milan@strakovi.com>:

Perform all ops in apply_gradient in a given tf.name_scope.

Merging this change closes #17550

FUTURE_COPYBARA_INTEGRATE_REVIEW=#17550 from foxik:apply_gradients_in_a_name_space 0dbf691
PiperOrigin-RevId: 509550993
@copybara-service bot merged commit 55c0bee into keras-team:master Feb 14, 2023
@foxik deleted the apply_gradients_in_a_name_space branch February 14, 2023 23:53