Commit 5ecd3d9a authored by Cameron McFarland

Merge branch 'cmcfarland/snowplow' into 'master'

Adding SnowPlow in AWS as an environment.

See merge request !800
parents 4e4e7e93 bd69fde5
# Terraform for the AWS SnowPlow Pipeline
This configuration uses the following AWS services to host SnowPlow. There
may be more in use, but these are the primary services.
1. EC2 (Auto Scaling Groups, Launch Configurations, ELB, Target Groups,
Security Groups)
1. Kinesis (Streams and Firehose)
1. Lambda
1. DynamoDB
1. IAM (Policies and Roles)
1. S3
1. VPC (Subnets, VPC, Internet Gateways, Routes, Routing Tables)
## Design Document
If you want to know more about the SnowPlow infrastructure, please consult the
[design document](https://about.gitlab.com/handbook/engineering/infrastructure/design/snowplow/).
## SnowPlow Installs and Configs
There are two types of SnowPlow nodes (Collectors and Enrichers), and both are
installed and configured via user-data in their launch configurations.
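As a rough illustration, a launch configuration driving one of these node types
might look like the sketch below. The resource names, AMI variable, and
user-data path are hypothetical, not the exact ones in this repo.

```hcl
resource "aws_launch_configuration" "collector" {
  # Hypothetical names; the real blocks live in this repo's config.
  name_prefix     = "snowplow-collector-"
  image_id        = var.collector_ami_id # assumed variable
  instance_type   = "t3.medium"          # assumed size
  security_groups = [aws_security_group.collector.id]

  # The install script (like the collector script shown further down)
  # is passed as user-data and runs on first boot.
  user_data = file("${path.module}/user-data/collector.sh")

  lifecycle {
    create_before_destroy = true
  }
}

resource "aws_autoscaling_group" "collector" {
  name_prefix          = "snowplow-collector-"
  launch_configuration = aws_launch_configuration.collector.name
  min_size             = 2
  max_size             = 4
  vpc_zone_identifier  = var.public_subnet_ids # assumed subnets
}
```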
## Kinesis Streams
Kinesis streams are how SnowPlow hands data off from the collector to the
enricher to the s3loader. The pipeline uses four streams; a minimal Terraform
sketch follows the list.
* snowplow-raw-good
* snowplow-raw-bad
* snowplow-enriched-good
* snowplow-enriched-bad
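Each stream is a plain `aws_kinesis_stream` resource. A minimal sketch for one
of them, assuming a shard count of 1 (the real shard counts live in this
repo's config):

```hcl
resource "aws_kinesis_stream" "raw_good" {
  # Hypothetical resource name; the stream name matches the list above.
  name        = "snowplow-raw-good"
  shard_count = 1 # assumed; size to the expected collector throughput
}
```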
## Kinesis Firehose and Lambda
Kinesis Firehose takes events from a stream, applies a Lambda function to each
event, and then writes the result into the S3 bucket.
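One way to wire that up in Terraform is sketched below. The resource names and
IAM role are assumptions for illustration; only the delivery stream name, the
bucket, and the Lambda function name are taken from this merge request.

```hcl
resource "aws_kinesis_firehose_delivery_stream" "enriched_good" {
  name        = "SnowPlowEnrichedGood"
  destination = "extended_s3"

  # Read from the enriched-good Kinesis stream...
  kinesis_source_configuration {
    kinesis_stream_arn = aws_kinesis_stream.enriched_good.arn
    role_arn           = aws_iam_role.firehose.arn # hypothetical role
  }

  # ...and land the records in S3, running each batch through the
  # formatting Lambda on the way.
  extended_s3_configuration {
    role_arn   = aws_iam_role.firehose.arn
    bucket_arn = "arn:aws:s3:::gitlab-com-snowplow-events"

    processing_configuration {
      enabled = true

      processors {
        type = "Lambda"
        parameters {
          parameter_name  = "LambdaArn"
          parameter_value = "${aws_lambda_function.formatter.arn}:$LATEST"
        }
      }
    }
  }
}
```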
## Lambda Function
Firehose uses a Lambda function to format events written to S3. The Lambda
function code is in the file ```lambda/lambda_function.py```. As of this
writing, all this function does is append a newline to the end of each event
before it is written to S3.
The AWS provider for Terraform requires a zip file of this code to create or
update the Lambda function. There is a data archive object in the config with
the name ```snowplow_lambda_event_formatter_archive``` that builds the zip file
from the function's Python script. For now, the zip contains a single file (the
lambda_function.py file) with no directory structure.
If the hash of that file changes, Terraform will try to update the function.
The zip file hash can change even when no code changes were made; that is fine,
as the function is safe to replace on the fly in the Lambda config.
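That data archive object is the standard `archive_file` data source. A minimal
sketch of it and the function it feeds, with an assumed output path, runtime,
and role (the archive name and function name come from this merge request):

```hcl
data "archive_file" "snowplow_lambda_event_formatter_archive" {
  type        = "zip"
  source_file = "${path.module}/lambda/lambda_function.py"
  output_path = "${path.module}/lambda/lambda_function.zip" # assumed path
}

resource "aws_lambda_function" "formatter" {
  # Hypothetical resource name; the function name matches the IAM policies.
  function_name = "SnowPlowFirehoseFormatter"
  handler       = "lambda_function.lambda_handler"
  runtime       = "python3.7"              # assumed runtime
  filename      = data.archive_file.snowplow_lambda_event_formatter_archive.output_path
  role          = aws_iam_role.lambda.arn  # hypothetical role

  # Re-upload whenever the zip contents change.
  source_code_hash = data.archive_file.snowplow_lambda_event_formatter_archive.output_base64sha256
}
```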
## DynamoDB
The enricher and s3loader nodes use DynamoDB to track Kinesis state. Normally
these tables would be allocated by Terraform, but things did not seem to work
properly unless the nodes created the tables themselves. Therefore, access to
the tables is controlled by roles and policies (see the sketch after this
list), while the tables are managed by the SnowPlow nodes that need them. If a
table needs to be created, the nodes will do that on their own.
* SnowplowEnrich-gitlab-us-east-1
* SnowplowS3Loader-gitlab-us-east-1
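In Terraform terms, the access control is just an IAM policy scoped to the
table ARN, roughly like this sketch (hypothetical resource and role names; the
table ARN matches the enricher policy shown later in this merge request):

```hcl
resource "aws_iam_role_policy" "enricher_dynamodb" {
  name = "enricher-dynamodb"
  role = aws_iam_role.enricher.id # hypothetical role

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect   = "Allow"
      Action   = ["dynamodb:*"]
      Resource = ["arn:aws:dynamodb:us-east-1:855262394183:table/SnowplowEnrich-gitlab-us-east-1"]
    }]
  })
}
```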
## Launch Config Changes and Production Instances
Updating the launch config only applies to new systems coming up in the
auto-scaling group; existing EC2 instances won't be changed. You will have to
rotate them manually to have them replaced.
### SSL Certificate for Load Balancer
The certificate is referenced by its ARN in AWS. We're not going to put the
private key in TF, so this will have to remain an ARN reference.
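On the load balancer side, that ARN is all the HTTPS listener needs. A minimal
sketch, assuming a variable holds the ARN and that the balancer and target
group are defined elsewhere in this config:

```hcl
resource "aws_lb_listener" "collector_https" {
  load_balancer_arn = aws_lb.collector.arn # hypothetical resource
  port              = 443
  protocol          = "HTTPS"
  certificate_arn   = var.collector_certificate_arn # ARN only; no private key

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.collector.arn
  }
}
```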
from __future__ import print_function
import base64

print('Loading function')


def lambda_handler(event, context):
    """Firehose transformation: append a newline to each record."""
    output = []
    for record in event['records']:
        print(record['recordId'])
        # Firehose delivers the record data base64-encoded.
        payload = base64.b64decode(record['data'])
        # Append a newline so events land in S3 one per line.
        processed = payload + b'\n'
        output_record = {
            'recordId': record['recordId'],
            'result': 'Ok',
            # Re-encode to base64 and decode to str so the result is
            # JSON-serializable.
            'data': base64.b64encode(processed).decode('utf-8'),
        }
        output.append(output_record)
    print('Successfully processed {} records.'.format(len(event['records'])))
    return {'records': output}
#!/bin/bash
## This has, so far, been written to run on the Amazon Linux 2 AMI.
## Install Java 1.8
yum -y install java-1.8.0-openjdk
## Set up user, group, and install location
groupadd snowplow
adduser --system --gid snowplow snowplow
mkdir -p /snowplow
mkdir -p /snowplow/config
mkdir -p /snowplow/logs
## Install SnowPlow Kinesis Collector
mkdir -p /tmp/build
cd /tmp/build
wget -q http://dl.bintray.com/snowplow/snowplow-generic/snowplow_scala_stream_collector_kinesis_0.15.0.zip
unzip -d /snowplow/bin snowplow_scala_stream_collector_kinesis_0.15.0.zip
cd /tmp
rm -rf /tmp/build
## Need to copy in a config
cat > /snowplow/config/collector.hocon <<EOF
collector {
interface = "0.0.0.0"
port = "8000"
production = true
p3p {
policyRef = "/w3c/p3p.xml"
CP = "NOI DSP COR NID PSA OUR IND COM NAV STA"
}
crossDomain {
enabled = true
domains = [ "*" ]
secure = true
}
cookie {
enabled = false
expiration = "365 days"
name = "snwplw"
domain = "gitlab.sinter-collect.com"
}
doNotTrackCookie {
enabled = false
name = "COLLECTOR_DO_NOT_TRACK_COOKIE_NAME"
value = "COLLECTOR_DO_NOT_TRACK_COOKIE_VALUE"
}
cookieBounce {
enabled = false
name = "n3pc"
fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000"
forwardedProtocolHeader = "X-Forwarded-Proto"
}
redirectMacro {
enabled = false
}
cors {
accessControlMaxAge = 5 seconds
}
rootResponse {
enabled = false
statusCode = 302
}
prometheusMetrics {
enabled = false
}
streams {
good = "snowplow-raw-good"
bad = "snowplow-raw-bad"
useIpAddressAsPartitionKey = true
sink {
enabled = kinesis
region = "us-east-1"
threadPoolSize = 10
aws {
accessKey = iam
secretKey = iam
}
backoffPolicy {
minBackoff = 10
maxBackoff = 300000
}
}
buffer {
byteLimit = 16384
recordLimit = 1000
timeLimit = 10000
}
}
}
akka {
loglevel = OFF
loggers = ["akka.event.slf4j.Slf4jLogger"]
http.server {
remote-address-header = on
raw-request-uri-header = on
parsing {
max-uri-length = 32768
uri-parsing-mode = relaxed
}
}
}
EOF
chown -R snowplow:snowplow /snowplow
## Start the collector service
su snowplow -g snowplow -c 'nohup /usr/bin/java -jar /snowplow/bin/snowplow-stream-collector-kinesis-0.15.0.jar --config /snowplow/config/collector.hocon > /snowplow/logs/out.log 2>&1 &'
#!/bin/bash
## This has, so far, been written to run on the Amazon Linux 2 AMI.
## Install Java 1.8
yum -y install java-1.8.0-openjdk
## Set up user, group, and install location
groupadd snowplow
adduser --system --gid snowplow snowplow
mkdir -p /snowplow
mkdir -p /snowplow/config
mkdir -p /snowplow/enrichments
mkdir -p /snowplow/logs
## Install SnowPlow Kinesis Enricher
mkdir -p /tmp/build
cd /tmp/build
wget -q http://dl.bintray.com/snowplow/snowplow-generic/snowplow_stream_enrich_kinesis_0.21.0.zip
unzip -d /snowplow/bin snowplow_stream_enrich_kinesis_0.21.0.zip
cd /tmp
rm -rf /tmp/build
## We need an Iglu resolver config
cat > /snowplow/config/iglu_resolver.json <<EOJ
{
"schema": "iglu:com.snowplowanalytics.iglu/resolver-config/jsonschema/1-0-1",
"data": {
"cacheSize": 500,
"repositories": [
{
"name": "Iglu Central",
"priority": 0,
"vendorPrefixes": [ "com.snowplowanalytics" ],
"connection": {
"http": {
"uri": "http://iglucentral.com"
}
}
},
{
"name": "Iglu Central - GCP Mirror",
"priority": 1,
"vendorPrefixes": [ "com.snowplowanalytics" ],
"connection": {
"http": {
"uri": "http://mirror01.iglucentral.com"
}
}
}
]
}
}
EOJ
## IP Lookup Enrichment
cat > /snowplow/enrichments/ip_lookups.json <<EOR
{
"schema": "iglu:com.snowplowanalytics.snowplow/ip_lookups/jsonschema/2-0-0",
"data": {
"name": "ip_lookups",
"vendor": "com.snowplowanalytics.snowplow",
"enabled": true,
"parameters": {
"geo": {
"database": "GeoLite2-City.mmdb",
"uri": "http://snowplow-hosted-assets.s3.amazonaws.com/third-party/maxmind"
}
}
}
}
EOR
## user_agent_utils_config Enrichment
cat > /snowplow/enrichments/user_agent_utils_config.json <<EOA
{
"schema": "iglu:com.snowplowanalytics.snowplow/user_agent_utils_config/jsonschema/1-0-0",
"data": {
"vendor": "com.snowplowanalytics.snowplow",
"name": "user_agent_utils_config",
"enabled": true,
"parameters": {}
}
}
EOA
## Need to copy in a config
cat > /snowplow/config/enricher.hocon <<EOF
enrich {
production = true
streams {
in {
raw = "snowplow-raw-good"
}
out {
enriched = "snowplow-enriched-good"
bad = "snowplow-enriched-bad"
partitionKey = "user_ipaddress"
}
sourceSink {
enabled = kinesis
aws {
accessKey = iam
secretKey = iam
}
region = "us-east-1"
maxRecords = 10000
initialPosition = TRIM_HORIZON
backoffPolicy {
minBackoff = 10
maxBackoff = 300000
}
}
buffer {
byteLimit = 16384
recordLimit = 1000
timeLimit = 10000
}
appName = "SnowplowEnrich-gitlab-us-east-1"
}
}
EOF
chown -R snowplow:snowplow /snowplow
## Start the enricher service
su snowplow -g snowplow -c 'nohup /usr/bin/java -jar /snowplow/bin/snowplow-stream-enrich-kinesis-0.21.0.jar --config /snowplow/config/enricher.hocon --enrichments file:/snowplow/enrichments --resolver file:/snowplow/config/iglu_resolver.json > /snowplow/logs/out.log 2>&1 &'
{
"Version" : "2012-10-17",
"Id" : "Policy1560181228695",
"Statement" : [
{
"Sid" : "Stmt1560181207940",
"Effect" : "Allow",
"Principal" : {
"AWS" : "arn:aws:iam::855262394183:user/datateam-snowplow-ro"
},
"Action" : "s3:ListBucket",
"Resource" : "arn:aws:s3:::gitlab-com-snowplow-events"
},
{
"Sid" : "Stmt1560181227007",
"Effect" : "Allow",
"Principal" : {
"AWS" : "arn:aws:iam::855262394183:user/datateam-snowplow-ro"
},
"Action" : "s3:GetObject",
"Resource" : "arn:aws:s3:::gitlab-com-snowplow-events/*"
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"cloudwatch:PutMetricData"
],
"Resource": [
"*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"kinesis:*"
],
"Resource": [
"*"
]
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"cloudwatch:PutMetricData"
],
"Resource": [
"*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"dynamodb:*"
],
"Resource": [
"arn:aws:dynamodb:us-east-1:855262394183:table/SnowplowEnrich-gitlab-us-east-1"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"kinesis:*"
],
"Resource": [
"*"
]
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"glue:GetTableVersions"
],
"Resource": "*"
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"s3:AbortMultipartUpload",
"s3:GetBucketLocation",
"s3:GetObject",
"s3:ListBucket",
"s3:ListBucketMultipartUploads",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::gitlab-com-snowplow-events",
"arn:aws:s3:::gitlab-com-snowplow-events/*",
"arn:aws:s3:::%FIREHOSE_BUCKET_NAME%",
"arn:aws:s3:::%FIREHOSE_BUCKET_NAME%/*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"lambda:InvokeFunction",
"lambda:GetFunctionConfiguration"
],
"Resource": "arn:aws:lambda:us-east-1:855262394183:function:SnowPlowFirehoseFormatter:$LATEST"
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:us-east-1:855262394183:log-group:/aws/kinesisfirehose/SnowPlowEnrichedBad:log-stream:*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"kinesis:DescribeStream",
"kinesis:GetShardIterator",
"kinesis:GetRecords"
],
"Resource": "arn:aws:kinesis:us-east-1:855262394183:stream/snowplow-enriched-bad"
},
{
"Effect": "Allow",
"Action": [
"kms:Decrypt"
],
"Resource": [
"arn:aws:kms:us-east-1:855262394183:key/%SSE_KEY_ID%"
],
"Condition": {
"StringEquals": {
"kms:ViaService": "kinesis.us-east-1.amazonaws.com"
},
"StringLike": {
"kms:EncryptionContext:aws:kinesis:arn": "arn:aws:kinesis:us-east-1:855262394183:stream/snowplow-enriched-bad"
}
}
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"glue:GetTableVersions"
],
"Resource": "*"
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"s3:AbortMultipartUpload",
"s3:GetBucketLocation",
"s3:GetObject",
"s3:ListBucket",
"s3:ListBucketMultipartUploads",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::gitlab-com-snowplow-events",
"arn:aws:s3:::gitlab-com-snowplow-events/*",
"arn:aws:s3:::%FIREHOSE_BUCKET_NAME%",
"arn:aws:s3:::%FIREHOSE_BUCKET_NAME%/*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"lambda:InvokeFunction",
"lambda:GetFunctionConfiguration"
],
"Resource": "arn:aws:lambda:us-east-1:855262394183:function:SnowPlowFirehoseFormatter:$LATEST"
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:us-east-1:855262394183:log-group:/aws/kinesisfirehose/SnowPlowEnrichedGood:log-stream:*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"kinesis:DescribeStream",
"kinesis:GetShardIterator",
"kinesis:GetRecords"
],
"Resource": "arn:aws:kinesis:us-east-1:855262394183:stream/snowplow-enriched-good"
},
{
"Effect": "Allow",
"Action": [
"kms:Decrypt"
],
"Resource": [
"arn:aws:kms:us-east-1:855262394183:key/%SSE_KEY_ID%"
],
"Condition": {
"StringEquals": {
"kms:ViaService": "kinesis.us-east-1.amazonaws.com"
},
"StringLike": {
"kms:EncryptionContext:aws:kinesis:arn": "arn:aws:kinesis:us-east-1:855262394183:stream/snowplow-enriched-good"
}
}
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"glue:GetTableVersions"
],
"Resource": "*"
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"s3:AbortMultipartUpload",
"s3:GetBucketLocation",
"s3:GetObject",
"s3:ListBucket",
"s3:ListBucketMultipartUploads",
"s3:PutObject"
],
"Resource": [
"arn:aws:s3:::gitlab-com-snowplow-events",
"arn:aws:s3:::gitlab-com-snowplow-events/*",
"arn:aws:s3:::%FIREHOSE_BUCKET_NAME%",
"arn:aws:s3:::%FIREHOSE_BUCKET_NAME%/*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"lambda:InvokeFunction",
"lambda:GetFunctionConfiguration"
],
"Resource": "arn:aws:lambda:us-east-1:855262394183:function:SnowPlowFirehoseFormatter:$LATEST"
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:us-east-1:855262394183:log-group:/aws/kinesisfirehose/SnowPlowRawBad:log-stream:*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"kinesis:DescribeStream",
"kinesis:GetShardIterator",
"kinesis:GetRecords"
],
"Resource": "arn:aws:kinesis:us-east-1:855262394183:stream/snowplow-raw-bad"
},
{
"Effect": "Allow",
"Action": [
"kms:Decrypt"
],
"Resource": [
"arn:aws:kms:us-east-1:855262394183:key/%SSE_KEY_ID%"
],
"Condition": {
"StringEquals": {
"kms:ViaService": "kinesis.us-east-1.amazonaws.com"
},
"StringLike": {
"kms:EncryptionContext:aws:kinesis:arn": "arn:aws:kinesis:us-east-1:855262394183:stream/snowplow-raw-bad"
}
}
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "logs:CreateLogGroup",
"Resource": "arn:aws:logs:us-east-1:855262394183:*"
},
{
"Effect": "Allow",
"Action": [
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:us-east-1:855262394183:log-group:/aws/lambda/SnowPlowFirehoseFormatter:*"
]
}
]
}
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"cloudwatch:PutMetricData"
],
"Resource": [
"*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"s3:*"
],