This is an archive of the discontinued Mercurial Phabricator instance.

Differential D6925

ci: track spot instance state, randomly assign availability zone
Changes Planned · Public

Authored by indygreg on Sep 30 2019, 11:57 PM.

Download Raw Diff

Details

Reviewers

None

Group Reviewers

hg-reviewers

Summary

The previous implementation of spot instance requests was too
naive.

I observed the CI system making spot instance requests that were
expiring due to insufficient capacity, and the web UI wasn't
making this obvious.

This commit improves the scheduling of spot instances a little.

First, we randomize the availability zone that the spot instance
request is assigned to. I noticed all spot requests were being
assigned to us-west-2c. I'm not sure why. The EC2 docs say
Amazon will assign an availability zone randomly, but in practice
it was always assigning the same zone, which had no capacity.
Choosing a random availability zone ourselves seems more robust.

We also update job state accounting to store the spot instance
request ID and the number of spot instance requests. This will
help us inspect the spot instance request after it has been
created (functionality for doing so will be introduced in a
subsequent commit). We also update the execution state to
reflect that a spot instance has been requested. This will give
users more context and can be used to influence behavior should
we want to try launching another instance at a later time.

Diff Detail

Repository

rHG Mercurial

Lint

Lint Skipped

Unit

Unit Tests Skipped

Event Timeline

indygreg created this revision. · Sep 30 2019, 11:57 PM

Herald added a reviewer: hg-reviewers. · View Herald Transcript · Sep 30 2019, 11:57 PM

Herald added a subscriber: mercurial-devel. · View Herald Transcript

indygreg added a child revision: D6926: ci: retry expired spot instance requests. · Sep 30 2019, 11:57 PM

Oops.

indygreg planned changes to this revision. · Oct 16 2019, 10:52 PM

Revision Contents
Changeset List

			Path	Packages
M			contrib/ci/lambda_functions/ci.py (47 lines)
M			contrib/ci/lambda_functions/web.py (2 lines)
M			contrib/ci/terraform/job_executor.tf (3 lines)

Commit	Parents	Author	Summary	Date
ab48c96d0845	4c64ebd63909	Gregory Szorc		Sep 30 2019, 10:21 PM

Status	Author	Revision
Changes Planned	indygreg	D6929 ci: add comment about block duration pricing
Changes Planned	indygreg	D6928 ci: fetch explicit attributes
Changes Planned	indygreg	D6927 ci: report cost to run each job
Changes Planned	indygreg	D6926 ci: retry expired spot instance requests
Changes Planned	indygreg	D6925 ci: track spot instance state, randomly assign availability zone
Changes Planned	indygreg	D6924 ci: store job start parameters in DynamoDB
Changes Planned	indygreg	D6922 ci: implement a "try server"
Changes Planned	indygreg	D6906 ci: implement a new CI system for Mercurial

Diff 16719

contrib/ci/lambda_functions/ci.py

	# ci.py - Lambda functions for Mercurial CI			# ci.py - Lambda functions for Mercurial CI
	#			#
	# Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>			# Copyright 2019 Gregory Szorc <gregory.szorc@gmail.com>
	#			#
	# This software may be used and distributed according to the terms of the			# This software may be used and distributed according to the terms of the
	# GNU General Public License version 2 or any later version.			# GNU General Public License version 2 or any later version.
	# no-check-code because Python 3 native.			# no-check-code because Python 3 native.

	import base64			import base64
	import datetime			import datetime
	import decimal			import decimal
	import json			import json
	import os			import os
				import random
	import time			import time
	import urllib.request			import urllib.request
	import uuid			import uuid

	import boto3			import boto3
	from boto3.dynamodb.conditions import (			from boto3.dynamodb.conditions import (
	Key,			Key,
	)			)
	'schedule_time': schedule_time,			'schedule_time': schedule_time,
	# We encode as JSON because DynamoDB doesn't like some types			# We encode as JSON because DynamoDB doesn't like some types
	# like empty strings.			# like empty strings.
	'start_params': json.dumps({			'start_params': json.dumps({
	'user_data_template': LINUX_USER_DATA,			'user_data_template': LINUX_USER_DATA,
	'user_data_params': user_data_params,			'user_data_params': user_data_params,
	'ec2_instance_launch_config': config,			'ec2_instance_launch_config': config,
	}, sort_keys=True),			}, sort_keys=True),
				'spot_instance_request_count': 0,
	})			})

	print('adding job to pending queue')			print('adding job to pending queue')
	sqs.send_message(			sqs.send_message(
	QueueUrl=sqs_url,			QueueUrl=sqs_url,
	MessageBody=json.dumps({'job_id': job_id})			MessageBody=json.dumps({'job_id': job_id})
	)			)


	def start_pending_job(ec2, job_table, job_id):			def start_pending_job(ec2, job_table, job_id):
	"""Called to request the start of a pending job."""			"""Called to request the start of a pending job."""
	res = job_table.get_item(Key={'job_id': job_id}, ConsistentRead=True)			res = job_table.get_item(Key={'job_id': job_id}, ConsistentRead=True)
	if 'Item' not in res:			if 'Item' not in res:
	print('unable to find job %s' % job_id)			print('unable to find job %s' % job_id)
	return			return

	job = res['Item']			job = res['Item']
				request_spot_instance_for_job(ec2, job_table, job)


				def request_spot_instance_for_job(ec2, job_table, job):
				"""Request a spot instance to start a job."""
				job_id = job['job_id']
				print('requesting spot instance for job %s' % job_id)

				# Fresh job request.
				if job['execution_state'] == 'pending':
				# Pick an availability zone randomly.
				availability_zones = [
				az['ZoneName']
				for az in ec2.describe_availability_zones()['AvailabilityZones']
				if az['State'] == 'available']
				availability_zone = random.choice(availability_zones)
				else:
				print('unhandled execution_state: %s' % job['execution_state'])
				return

	start_params = json.loads(job['start_params'])			start_params = json.loads(job['start_params'])
	user_data_template = start_params['user_data_template']			user_data_template = start_params['user_data_template']
	user_data_params = start_params['user_data_params']			user_data_params = start_params['user_data_params']
	ec2_instance_config = start_params['ec2_instance_launch_config']			ec2_instance_config = start_params['ec2_instance_launch_config']

	user_data = user_data_template.format(**user_data_params)			user_data = user_data_template.format(**user_data_params)

	print('requesting spot instance for job %s' % user_data_params['job_id'])

	launch_spec = dict(ec2_instance_config)			launch_spec = dict(ec2_instance_config)
				launch_spec['Placement'] = {
				'AvailabilityZone': availability_zone,
				}
	launch_spec['UserData'] = base64.b64encode(user_data.encode('utf-8')).decode('utf-8')			launch_spec['UserData'] = base64.b64encode(user_data.encode('utf-8')).decode('utf-8')

	# Spot instances are substantially cheaper but can be terminated at will			# Spot instances are substantially cheaper but can be terminated at will
	# by Amazon. That's fine. We're a CI system. If the instance is terminated,			# by Amazon. That's fine. We're a CI system. If the instance is terminated,
	# we can just retry the job.			# we can just retry the job.
	#			#
	# The max bid price is the on-demand price. So in the typical case we save			# The max bid price is the on-demand price. So in the typical case we save
	# $$$. If we're unlucky we pay the on-demand rate. You can't lose.			# $$$. If we're unlucky we pay the on-demand rate. You can't lose.
	ec2.request_spot_instances(			res = ec2.request_spot_instances(
	BlockDurationMinutes=60,			BlockDurationMinutes=60,
	ValidUntil=datetime.datetime.utcnow() + datetime.timedelta(minutes=10),			ValidUntil=datetime.datetime.utcnow() + datetime.timedelta(minutes=10),
	LaunchSpecification=launch_spec,			LaunchSpecification=launch_spec,
	)			)

				spot_instance_request_id = res['SpotInstanceRequests'][0]['SpotInstanceRequestId']
				print('spot instance request id: %s' % spot_instance_request_id)

				print('recording spot instance state for job %s' % job_id)
				job_table.update_item(
				Key={'job_id': job_id},
				UpdateExpression=(
				'set execution_state = :state, '
				'spot_instance_request_id = :sir, '
				'spot_instance_request_count = spot_instance_request_count + :incr'
				),
				ExpressionAttributeValues={
				':state': 'spot-instance-requested',
				':sir': spot_instance_request_id,
				':incr': 1,
				}
				)


	def react_to_instance_state_change(job_table, instance, state):			def react_to_instance_state_change(job_table, instance, state):
	"""React to a CI worker instance state change."""			"""React to a CI worker instance state change."""
	now = decimal.Decimal(time.time())			now = decimal.Decimal(time.time())

	# CI workers advertise their job info via tags. However, the tags cannot			# CI workers advertise their job info via tags. However, the tags cannot
	# be set for spot instances and are instead encoded in user data. So when			# be set for spot instances and are instead encoded in user data. So when
	# a spot instance starts, detect that here and set the tags so they can be			# a spot instance starts, detect that here and set the tags so they can be

contrib/ci/lambda_functions/web.py

	else:			else:
	fail_count = 'n/a'			fail_count = 'n/a'

	if 'skip_count' in job_info:			if 'skip_count' in job_info:
	skip_count = '%d' % job_info['skip_count']			skip_count = '%d' % job_info['skip_count']
	else:			else:
	skip_count = 'n/a'			skip_count = 'n/a'

	if job_info['execution_state'] in ('pending', 'running'):			if job_info['execution_state'] in ('pending', 'spot-instance-requested', 'running'):
	job_state = job_info['execution_state']			job_state = job_info['execution_state']
	elif job_info['execution_state'] == 'done':			elif job_info['execution_state'] == 'done':
	exit_clean = job_info.get('exit_clean')			exit_clean = job_info.get('exit_clean')
	if exit_clean is None:			if exit_clean is None:
	job_state = 'unknown'			job_state = 'unknown'
	elif exit_clean is True:			elif exit_clean is True:
	job_state = 'completed'			job_state = 'completed'
	elif exit_clean is False:			elif exit_clean is False:

contrib/ci/terraform/job_executor.tf

	statement {			statement {
	effect = "Allow"			effect = "Allow"
	actions = [			actions = [
	"ec2:*",			"ec2:*",
	"iam:*",			"iam:*",
	]			]
	resources = ["*"]			resources = ["*"]
	}			}
	# Allow querying job state in DynamoDB.			# Allow querying and updating job state in DynamoDB.
	statement {			statement {
	effect = "Allow"			effect = "Allow"
	actions = [			actions = [
	"dynamodb:GetItem",			"dynamodb:GetItem",
				"dynamodb:UpdateItem",
	]			]
	resources = [			resources = [
	aws_dynamodb_table.ci_job.arn,			aws_dynamodb_table.ci_job.arn,
	]			]
	}			}
	}			}

	resource "aws_iam_role_policy" "lambda_ci_handle_pending_job" {			resource "aws_iam_role_policy" "lambda_ci_handle_pending_job" {