[docs]classRemoteClusterExecutionError(RemoteError):"""Custom exception class for collecting multiple execution errors on a cluster."""def__init__(self,results:List[Tuple[NodeSet,MsgTreeElem]],failures:List[RemoteExecutionError]):"""Override the parent constructor to add failures and results as attributes."""super().__init__("{n} hosts have failed execution".format(n=len(failures)))self.failures=failuresself.results=results

[docs]classRemoteHostsAdapter:"""Base adapter to write classes that expand the capabilities of RemoteHosts. This adapter class is a helper class to reduce duplication when writing classes that needs to add capabilities to a RemoteHosts instance. The goal is to not extend the RemoteHosts but instead delegate to its instances. This class fits when a single RemoteHosts instance is enough, but for more complex cases, in which multiple RemoteHosts instances should be orchestrated, it's ok to not extend this class and create a standalone one. """def__init__(self,remote_hosts:'RemoteHosts')->None:"""Initialize the instance. Arguments: remote_hosts (spicerack.remote.RemoteHosts): the instance to act on the remote hosts. """self._remote_hosts=remote_hostsdef__str__(self)->str:"""String representation of the instance. Returns: str: the string representation of the target hosts. """returnstr(self._remote_hosts)def__len__(self)->int:"""Length of the instance. Returns: int: the number of target hosts. """returnlen(self._remote_hosts)

[docs]classLBRemoteCluster(RemoteHostsAdapter):"""Class usable to operate on a cluster of servers with pooling/depooling logic in conftool."""def__init__(self,config:Config,remote_hosts:'RemoteHosts',conftool:ConftoolEntity)->None:"""Initialize the instance. Arguments: config (spicerack.config.Config): cumin configuration. remote_hosts (spicerack.remote.RemoteHosts): the instance to act on the remote hosts. conftool (spicerack.confctl.ConftoolEntity): the conftool entity to operate on. """self._config=configself._conftool=conftoolsuper().__init__(remote_hosts)

[docs]defrun(self,*commands:Union[str,Command],svc_to_depool:Optional[List[str]]=None,batch_size:int=1,batch_sleep:Optional[float]=None,is_safe:bool=False,max_failed_batches:int=0)->List[Tuple[NodeSet,MsgTreeElem]]:"""Run commands while depooling servers in groups of batch_size. For clusters behind a load balancer, we typically want to be able to depool a server from a specific service, then run any number of commands on it, and finally repool it. We also want to ensure we can only have at max N hosts depooled at any time. Given cumin doesn't have pre- and post- execution hooks, we break the remote run in smaller groups and execute on one group at a time, in parallel on all the servers. Note this works a bit differently than how the cumin moving window works, as here we'll have to wait for the execution on all servers in a group before moving on to the next. Arguments: *commands (str, cumin.transports.Command): Arbitrary number of commands to execute. svc_to_depool (list): A list of services (in conftool) to depool. batch_size (int, optional): the batch size for cumin, as an integer. Defaults to 1. batch_sleep (float, optional): the batch sleep in seconds to use before scheduling the next batch of hosts. is_safe (bool, optional): whether the command is safe to run also in dry-run mode because it's a read-only max_failed_batches (int, optional): Maximum number of batches that can fail. Defaults to 0. Returns: list: cumin.transports.BaseWorker.get_results to allow to iterate over the results. Raises: spicerack.remote.RemoteExecutionError, spicerack.remote.RemoteClusterExecutionError: if the Cumin execution returns a non-zero exit code. """n_hosts=len(self._remote_hosts)# Ensure the batch size is smaller than the whole cluster. When acting on a cluster# we should never act on all hosts in parallel.# If that's needed, RemoteHosts can be used directly.# TODO: the right thing to do would be to check that batch_size is smaller than# (1 - depool_threshold) * pooled_hosts, but that would also need to know the current# state of the cluster.ifbatch_size<=0orbatch_size>=n_hosts:raiseRemoteError('Values for batch_size must be 0 < x < {}, got {}'.format(n_hosts,batch_size))# If no service needs depooling, the standard behavior of remote_hosts.run_async is used# TODO: add the ability to select all services.ifsvc_to_depoolisNone:returnlist(self._remote_hosts.run_async(*commands,success_threshold=(1.0-(max_failed_batches*batch_size)/n_hosts),batch_size=batch_size,batch_sleep=batch_sleep,is_safe=is_safe))results=[]# Find how much we must split the pool up to achieve the desired batch size.n_slices=math.ceil(n_hosts/batch_size)# TODO: add better failure handling. Right now we just support counting batches with a failure, while we might# want to support success_threshold.failures=[]forremotes_sliceinself._remote_hosts.split(n_slices):# Select the pooled servers for the selected services, from the group we're operating on now.withself._conftool.change_and_revert('pooled','yes','no',service='|'.join(svc_to_depool),name='|'.join(remotes_slice.hosts.striter())):try:forresultinremotes_slice.run_async(*commands,is_safe=is_safe,success_threshold=1.0):results.append(result)exceptRemoteExecutionErrorase:failures.append(e)# Break the execution loop if more than the maximum number of failures happened.iflen(failures)>max_failed_batches:break# TODO: skip sleep on the last runifbatch_sleepisnotNone:time.sleep(batch_sleep)iffailures:raiseRemoteClusterExecutionError(results,failures)returnresults

[docs]defrestart_services(self,services:List[str],svc_to_depool:List[str],*,batch_size:int=1,batch_sleep:Optional[float]=None)->List[Tuple[NodeSet,MsgTreeElem]]:"""Restart services in batches, removing the host from all the affected services first. Arguments: services (list): A list of services to act upon svc_to_depool (list): A list of services (in conftool) to depool. batch_size (int): the batch size for cumin, as an integer. Defaults to 1 batch_sleep (float, optional): the batch sleep between groups of runs. Returns: list: cumin.transports.BaseWorker.get_results to allow to iterate over the results. Raises: spicerack.remote.RemoteExecutionError, spicerack.remote.RemoteClusterExecutionError: if the Cumin execution returns a non-zero exit code. """returnself._act_on_services(services,svc_to_depool,'restart',batch_size,batch_sleep)

[docs]defreload_services(self,services:List[str],svc_to_depool:List[str],*,batch_size:int=1,batch_sleep:Optional[float]=None)->List[Tuple[NodeSet,MsgTreeElem]]:"""Reload services in batches, removing the host from all the affected services first. Arguments: services (list): A list of services to act upon svc_to_depool (list): A list of services (in conftool) to depool. batch_size (int): the batch size for cumin, as an integer.Defaults to 1 batch_sleep (float, optional): the batch sleep between groups of runs. Returns: list: cumin.transports.BaseWorker.get_results to allow to iterate over the results. Raises: spicerack.remote.RemoteExecutionError, spicerack.remote.RemoteClusterExecutionError: if the Cumin execution returns a non-zero exit code. """returnself._act_on_services(services,svc_to_depool,'reload',batch_size,batch_sleep)

def_act_on_services(self,services:List[str],svc_to_depool:List[str],what:str,batch_size:int,batch_sleep:Optional[float]=None)->List[Tuple[NodeSet,MsgTreeElem]]:"""Act on services in batches, depooling the servers first. Arguments: services (list): A list of services to act upon svc_to_depool (list): A list of services (in conftool) to depool. what (string): Action to perform. restart by default. batch_size (int): the batch size for cumin, as an integer. batch_sleep (float, optional): the batch sleep between groups of runs. Returns: list: cumin.transports.BaseWorker.get_results to allow to iterate over the results. Raises: spicerack.remote.RemoteExecutionError, spicerack.remote.RemoteClusterExecutionError: if the Cumin execution returns a non-zero exit code. """commands=['systemctl {w} "{s}"'.format(w=what,s=svc)forsvcinservices]returnself.run(*commands,svc_to_depool=svc_to_depool,batch_size=batch_size,batch_sleep=batch_sleep,is_safe=False)

[docs]classRemote:"""Remote class to interact with Cumin."""def__init__(self,config:str,dry_run:bool=True)->None:"""Initialize the instance. Arguments: config (str): the path of Cumin's configuration file. dry_run (bool, optional): whether this is a DRY-RUN. """self._config=Config(config)self._dry_run=dry_run

[docs]defquery_confctl(self,conftool:ConftoolEntity,**tags:str)->LBRemoteCluster:"""Execute a conftool node query and return the matching hosts. Arguments: conftool (spicerack.confctl.ConftoolEntity): the conftool instance for the node type objects. tags: Conftool tags for node type objects as keyword arguments. Returns: spicerack.remote.LBRemoteCluster: LBRemoteCluster instance matching the given query Raises: spicerack.remote.RemoteError """# get the list of hosts from confctltry:hosts_conftool=[obj.nameforobjinconftool.get(**tags)]query_string=','.join(hosts_conftool)exceptSpicerackErrorase:raiseRemoteError('Failed to execute the conftool query')fromeremote_hosts=self.query(query_string)host_diff=set(hosts_conftool)-set(remote_hosts.hosts)ifhost_diff:# pragma: no cover | This should never happen with a direct backend.logger.warning("Hosts present in conftool but not in puppet: %s",",".join(host_diff))returnLBRemoteCluster(self._config,remote_hosts,conftool)

[docs]classRemoteHosts:"""Remote Executor class. This class can be extended to customize the interaction with remote hosts passing a custom factory function to `spicerack.remote.Remote.query`. """def__init__(self,config:Config,hosts:NodeSet,dry_run:bool=True)->None:"""Initialize the instance. Arguments: config (cumin.Config): the configuration for Cumin. hosts (ClusterShell.NodeSet.NodeSet): the hosts to target for the remote execution. dry_run (bool, optional): whether this is a DRY-RUN. Raises: spicerack.remote.RemoteError: if no hosts were provided. """ifnothosts:raiseRemoteError('No hosts provided')self._config=configself._hosts=hostsself._dry_run=dry_run@propertydefhosts(self)->NodeSet:"""Getter for the hosts property. Returns: ClusterShell.NodeSet.NodeSet: a copy of the targeted hosts. """returnself._hosts.copy()def__str__(self)->str:"""String representation of the instance. Returns: str: the string representation of the target hosts. """returnstr(self._hosts)def__len__(self)->int:"""Length of the instance. Returns: int: the number of target hosts. """returnlen(self._hosts)

[docs]defsplit(self,n_slices:int)->Iterator['RemoteHosts']:"""Split the current remote in n_slices RemoteHosts instances. Arguments: n_slices (int): the number of slices to slice the remote in. Yields: The RemoteHosts instances for the subset of nodes. """fornodesetinself._hosts.split(n_slices):yieldRemoteHosts(self._config,nodeset,dry_run=self._dry_run)

[docs]defrun_async(self,*commands:Union[str,Command],success_threshold:float=1.0,batch_size:Optional[Union[int,str]]=None,batch_sleep:Optional[float]=None,is_safe:bool=False)->Iterator[Tuple[NodeSet,MsgTreeElem]]:"""Execute commands on hosts matching a query via Cumin in async mode. Arguments: *commands (str, cumin.transports.Command): arbitrary number of commands to execute on the target hosts. success_threshold (float, optional): to consider the execution successful, must be between 0.0 and 1.0. batch_size (int, str, optional): the batch size for cumin, either as percentage (e.g. ``25%``) or absolute number (e.g. ``5``). batch_sleep (float, optional): the batch sleep in seconds to use in Cumin before scheduling the next host. is_safe (bool, optional): whether the command is safe to run also in dry-run mode because it's a read-only command that doesn't modify the state. Returns: generator: cumin.transports.BaseWorker.get_results to allow to iterate over the results. Raises: RemoteExecutionError: if the Cumin execution returns a non-zero exit code. """returnself._execute(list(commands),mode='async',success_threshold=success_threshold,batch_size=batch_size,batch_sleep=batch_sleep,is_safe=is_safe)

[docs]defrun_sync(self,*commands:Union[str,Command],success_threshold:float=1.0,batch_size:Optional[Union[int,str]]=None,batch_sleep:Optional[float]=None,is_safe:bool=False)->Iterator[Tuple[NodeSet,MsgTreeElem]]:"""Execute commands on hosts matching a query via Cumin in sync mode. Arguments: *commands (str, cumin.transports.Command): arbitrary number of commands to execute on the target hosts. success_threshold (float, optional): to consider the execution successful, must be between 0.0 and 1.0. batch_size (int, str, optional): the batch size for cumin, either as percentage (e.g. ``25%``) or absolute number (e.g. ``5``). batch_sleep (float, optional): the batch sleep in seconds to use in Cumin before scheduling the next host. is_safe (bool, optional): whether the command is safe to run also in dry-run mode because it's a read-only command that doesn't modify the state. Returns: generator: cumin.transports.BaseWorker.get_results to allow to iterate over the results. Raises: RemoteExecutionError: if the Cumin execution returns a non-zero exit code. """returnself._execute(list(commands),mode='sync',success_threshold=success_threshold,batch_size=batch_size,batch_sleep=batch_sleep,is_safe=is_safe)

[docs]defreboot(self,batch_size:int=1,batch_sleep:Optional[float]=180.0)->None:"""Reboot hosts. Arguments: batch_size (int, optional): how many hosts to reboot in parallel. batch_sleep (float, optional): how long to sleep between one reboot and the next. """iflen(self._hosts)==1:# Temporary workaround until T213296 is fixed.batch_sleep=Nonelogger.info('Rebooting %d hosts in batches of %d with %.1fs of sleep in between: %s',len(self._hosts),batch_size,batch_sleepifbatch_sleepisnotNoneelse0.0,self._hosts)self.run_sync(transports.Command('reboot-host',timeout=30),batch_size=batch_size,batch_sleep=batch_sleep)

[docs]@retry(tries=25,delay=timedelta(seconds=10),backoff_mode='linear',exceptions=(RemoteExecutionError,RemoteCheckError))defwait_reboot_since(self,since:datetime)->None:"""Poll the host until is reachable and has an uptime lower than the provided datetime. Arguments: since (datetime.datetime): the time after which the host should have booted. Raises: spicerack.remote.RemoteCheckError: if unable to connect to the host or the uptime is higher than expected. """remaining=self.hostsdelta=(datetime.utcnow()-since).total_seconds()fornodeset,uptimeinself.uptime():ifuptime>=delta:raiseRemoteCheckError('Uptime for {hosts} higher than threshold: {uptime} > {delta}'.format(hosts=nodeset,uptime=uptime,delta=delta))remaining.difference_update(nodeset)ifremaining:raiseRemoteCheckError('Unable to check uptime from {num} hosts: {hosts}'.format(num=len(remaining),hosts=remaining))logger.info('Found reboot since %s for hosts %s',since,self._hosts)

[docs]defuptime(self)->List[Tuple[NodeSet,float]]:"""Get current uptime. Returns: list: a list of 2-element :py:class:`tuple` instances with hosts :py:class:`ClusterShell.NodeSet.NodeSet` as first item and :py:class:`float` uptime as second item. """results=self.run_sync(transports.Command('cat /proc/uptime',timeout=10),is_safe=True)# Callback to extract the uptime from /proc/uptime (i.e. getting 12345.67 from '12345.67 123456789.00').returnRemoteHosts.results_to_list(results,callback=lambdaoutput:float(output.split()[0]))

[docs]definit_system(self)->List[Tuple[NodeSet,str]]:"""Detect the init system. Returns: list: a list of 2-element tuples with hosts :py:class:`ClusterShell.NodeSet.NodeSet` as first item and the init system :py:class:`str` as second. """results=self.run_sync(transports.Command('ps --no-headers -o comm 1',timeout=10),is_safe=True)returnRemoteHosts.results_to_list(results)

[docs]@staticmethoddefresults_to_list(results:Iterator[Tuple[NodeSet,MsgTreeElem]],callback:Optional[Callable]=None)->List[Tuple[NodeSet,Any]]:"""Extract execution results into a list converting them with an optional callback. Todo: move it directly into Cumin. Arguments: results (generator): generator returned by run_sync() and run_async() to iterate over the results. callback (callable, optional): an optional callable to apply to each result output (it can be multiline). The callback will be called with a the string output as the only parameter and must return the extracted value. The return type can be chosen freely. Returns: list: a list of 2-element tuples with hosts :py:class:`ClusterShell.NodeSet.NodeSet` as first item and the extracted outputs :py:class:`str` as second. This is because NodeSet are not hashable. Raises: spicerack.remote.RemoteError: if unable to run the callback. """extracted=[]fornodeset,outputinresults:result=output.message().decode().strip()ifcallbackisnotNone:try:result=callback(result)exceptExceptionase:raiseRemoteError('Unable to extract data with {cb} for {hosts} from: {output}'.format(cb=callback.__name__,hosts=nodeset,output=result))fromeextracted.append((nodeset,result))returnextracted

def_execute(# pylint: disable=too-many-argumentsself,commands:Sequence[Union[str,Command]],mode:str='sync',success_threshold:float=1.0,batch_size:Optional[Union[int,str]]=None,batch_sleep:Optional[float]=None,is_safe:bool=False)->Iterator[Tuple[NodeSet,MsgTreeElem]]:"""Lower level Cumin's execution of commands on the target nodes. Arguments: commands (list): the list of commands to execute on the target hosts, either a list of commands or a list of cumin.transports.Command instances. mode (str, optional): the Cumin's mode of execution. Accepted values: sync, async. success_threshold (float, optional): to consider the execution successful, must be between 0.0 and 1.0. batch_size (int, str, optional): the batch size for cumin, either as percentage (e.g. ``25%``) or absolute number (e.g. ``5``). batch_sleep (float, optional): the batch sleep in seconds to use in Cumin before scheduling the next host. is_safe (bool, optional): whether the command is safe to run also in dry-run mode because it's a read-only command that doesn't modify the state. Returns: generator: as returned by :py:meth:`cumin.transports.BaseWorker.get_results` to iterate over the results. Raises: RemoteExecutionError: if the Cumin execution returns a non-zero exit code. """ifbatch_sizeisNone:parsed_batch_size={'value':None,'ratio':None}else:parsed_batch_size=target_batch_size(str(batch_size))target=transports.Target(self._hosts,batch_size=parsed_batch_size['value'],batch_size_ratio=parsed_batch_size['ratio'],batch_sleep=batch_sleep)worker=transport.Transport.new(self._config,target)worker.commands=commandsworker.handler=modeworker.success_threshold=success_thresholdlogger.debug('Executing commands %s on %d hosts: %s',commands,len(target.hosts),str(target.hosts))ifself._dry_runandnotis_safe:returniter(())# Empty generator# Temporary workaround until Cumin has full support to suppress output (T212783)# and the Colorama issue that slows down the process is fixed (T217038)stdout=transports.clustershell.sys.stdoutstderr=transports.clustershell.sys.stderrtry:withopen(os.devnull,'w')asdiscard_output:transports.clustershell.sys.stdout=discard_outputtransports.clustershell.sys.stderr=discard_outputret=worker.execute()finally:transports.clustershell.sys.stdout=stdouttransports.clustershell.sys.stderr=stderrifret!=0andnotself._dry_run:raiseRemoteExecutionError(ret,'Cumin execution failed')returnworker.get_results()