Files
feeddeck/supabase/functions/_shared/feed/medium_test.ts
Rico Berger 9e59439226 [core] Add Tests for Sources (#98)
This commit adds tests for all available sources.

This commit also fixes the parsing of Atom feeds for the RSS source,
where the `dc:date` field must be used for the `publishedAt` field.
2023-12-12 18:50:29 +01:00

378 lines
86 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { assertEquals } from 'std/assert';
import { createClient } from '@supabase/supabase-js';
import {
assertSpyCall,
assertSpyCalls,
returnsNext,
stub,
} from 'std/testing/mock';
import { ISource } from '../models/source.ts';
import { IProfile } from '../models/profile.ts';
import {
faviconFilter,
getMediumFeed,
isMediumUrl,
parseMediumOption,
} from './medium.ts';
import { utils } from '../utils/index.ts';
import { feedutils } from './utils/index.ts';
/**
 * Module-level Supabase client fixture, created against the local URL
 * 'http://localhost:54321' with the key 'test123'.
 *
 * NOTE(review): presumably no real request reaches this endpoint because the
 * tests stub the helpers that perform I/O (`stub` / `returnsNext` are
 * imported above) — confirm against the test cases further down the file.
 */
const supabaseClient = createClient('http://localhost:54321', 'test123');
/**
 * Profile fixture: empty `id`, tier 'free', and `createdAt` / `updatedAt`
 * both set to 0.
 *
 * NOTE(review): presumably passed as the profile argument to the functions
 * under test — confirm against the test cases further down the file.
 */
const mockProfile: IProfile = {
id: '',
tier: 'free',
createdAt: 0,
updatedAt: 0,
};
/**
 * Source fixture of type 'medium' with column id 'mycolumn' and user id
 * 'myuser'. `id` and `title` are intentionally left as empty strings here.
 *
 * NOTE(review): presumably the individual tests extend this object with the
 * source options they need and expect `id` / `title` to be filled in by the
 * code under test — confirm against the test cases further down the file.
 */
const mockSource: ISource = {
id: '',
columnId: 'mycolumn',
userId: 'myuser',
type: 'medium',
title: '',
};
/**
 * RSS 2.0 fixture for the Medium tag feed "kubernetes" (its self link is
 * https://medium.com/feed/tag/kubernetes).
 *
 * The channel contains two <item> entries. Each item carries a title, an
 * HTML description (image, snippet and "Continue reading" link), a link, a
 * guid, several categories, dc:creator, pubDate and atom:updated. Unlike a
 * full-content feed, the items have no content:encoded element.
 *
 * The literal is runtime test data: keep it byte-identical, including the
 * `&#x2026;` entity and the non-ASCII characters inside the CDATA sections.
 *
 * NOTE(review): presumably served as the body of a stubbed feed request in
 * the tag-related test cases — confirm against the tests further down.
 */
const responseTag = `<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:cc="http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
<channel>
<title><![CDATA[Kubernetes on Medium]]></title>
<description><![CDATA[Latest stories tagged with Kubernetes on Medium]]></description>
<link>https://medium.com/tag/kubernetes/latest?source=rss------kubernetes-5</link>
<image>
<url>https://cdn-images-1.medium.com/proxy/1*TGH72Nnw24QL3iV9IOm4VA.png</url>
<title>Kubernetes on Medium</title>
<link>https://medium.com/tag/kubernetes/latest?source=rss------kubernetes-5</link>
</image>
<generator>Medium</generator>
<lastBuildDate>Tue, 05 Dec 2023 18:26:48 GMT</lastBuildDate>
<atom:link href="https://medium.com/feed/tag/kubernetes" rel="self" type="application/rss+xml" />
<webMaster><![CDATA[yourfriends@medium.com]]></webMaster>
<atom:link href="http://medium.superfeedr.com" rel="hub" />
<item>
<title><![CDATA[Securing the Cloud: A Comprehensive Guide to Building a Kubernetes Threat Model for Enhanced…]]></title>
<description><![CDATA[<div class="medium-feed-item"><p class="medium-feed-image"><a href="https://blackcatdev.medium.com/securing-the-cloud-a-comprehensive-guide-to-building-a-kubernetes-threat-model-for-enhanced-e695e24db604?source=rss------kubernetes-5"><img src="https://cdn-images-1.medium.com/max/2600/0*HDPbjcSA7DITNbFp" width="4000"></a></p><p class="medium-feed-snippet">In the ever-evolving landscape of cloud computing and container orchestration, Kubernetes has emerged as a dominant force, providing a&#x2026;</p><p class="medium-feed-link"><a href="https://blackcatdev.medium.com/securing-the-cloud-a-comprehensive-guide-to-building-a-kubernetes-threat-model-for-enhanced-e695e24db604?source=rss------kubernetes-5">Continue reading on Medium »</a></p></div>]]></description>
<link>https://blackcatdev.medium.com/securing-the-cloud-a-comprehensive-guide-to-building-a-kubernetes-threat-model-for-enhanced-e695e24db604?source=rss------kubernetes-5</link>
<guid isPermaLink="false">https://medium.com/p/e695e24db604</guid>
<category><![CDATA[security]]></category>
<category><![CDATA[threat-modeling]]></category>
<category><![CDATA[kubernetes]]></category>
<category><![CDATA[cybersecurity]]></category>
<category><![CDATA[threat-model]]></category>
<dc:creator><![CDATA[BlackCatDev]]></dc:creator>
<pubDate>Tue, 05 Dec 2023 18:18:40 GMT</pubDate>
<atom:updated>2023-12-05T18:18:40.249Z</atom:updated>
</item>
<item>
<title><![CDATA[Fortifying Kubernetes Deployments: A Comprehensive Guide to Securing CI/CD Pipelines]]></title>
<description><![CDATA[<div class="medium-feed-item"><p class="medium-feed-image"><a href="https://blackcatdev.medium.com/fortifying-kubernetes-deployments-a-comprehensive-guide-to-securing-ci-cd-pipelines-096aacb27637?source=rss------kubernetes-5"><img src="https://cdn-images-1.medium.com/max/2600/0*XZum3wouH1vlsKV9" width="7952"></a></p><p class="medium-feed-snippet"># Securing CI/CD Pipelines for Kubernetes Deployments</p><p class="medium-feed-link"><a href="https://blackcatdev.medium.com/fortifying-kubernetes-deployments-a-comprehensive-guide-to-securing-ci-cd-pipelines-096aacb27637?source=rss------kubernetes-5">Continue reading on Medium »</a></p></div>]]></description>
<link>https://blackcatdev.medium.com/fortifying-kubernetes-deployments-a-comprehensive-guide-to-securing-ci-cd-pipelines-096aacb27637?source=rss------kubernetes-5</link>
<guid isPermaLink="false">https://medium.com/p/096aacb27637</guid>
<category><![CDATA[developer]]></category>
<category><![CDATA[devsecops]]></category>
<category><![CDATA[security]]></category>
<category><![CDATA[kubernetes]]></category>
<category><![CDATA[ci-cd-pipeline]]></category>
<dc:creator><![CDATA[BlackCatDev]]></dc:creator>
<pubDate>Tue, 05 Dec 2023 18:17:03 GMT</pubDate>
<atom:updated>2023-12-05T18:17:03.957Z</atom:updated>
</item>
</channel>
</rss>`;
const responseUser = `<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:cc="http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
<channel>
<title><![CDATA[Stories by Yuri Shkuro on Medium]]></title>
<description><![CDATA[Stories by Yuri Shkuro on Medium]]></description>
<link>https://medium.com/@YuriShkuro?source=rss-cff2e4ba6058------2</link>
<image>
<url>https://cdn-images-1.medium.com/fit/c/150/150/0*4Ljs60hv_8hpF_mj.jpg</url>
<title>Stories by Yuri Shkuro on Medium</title>
<link>https://medium.com/@YuriShkuro?source=rss-cff2e4ba6058------2</link>
</image>
<generator>Medium</generator>
<lastBuildDate>Tue, 05 Dec 2023 20:19:35 GMT</lastBuildDate>
<atom:link href="https://medium.com/@YuriShkuro/feed" rel="self" type="application/rss+xml" />
<webMaster><![CDATA[yourfriends@medium.com]]></webMaster>
<atom:link href="http://medium.superfeedr.com" rel="hub" />
<item>
<title><![CDATA[Experiment: Migrating OpenTracing-based application in Go to use the OpenTelemetry SDK]]></title>
<link>https://medium.com/jaegertracing/experiment-migrating-opentracing-based-application-in-go-to-use-the-opentelemetry-sdk-29b09fe2fbc4?source=rss-cff2e4ba6058------2</link>
<guid isPermaLink="false">https://medium.com/p/29b09fe2fbc4</guid>
<category><![CDATA[opentelemetry]]></category>
<category><![CDATA[migration]]></category>
<category><![CDATA[jaegertracing]]></category>
<category><![CDATA[opentracing]]></category>
<dc:creator><![CDATA[Yuri Shkuro]]></dc:creator>
<pubDate>Thu, 09 Feb 2023 05:44:06 GMT</pubDate>
<atom:updated>2023-02-09T05:46:24.711Z</atom:updated>
<content:encoded><![CDATA[<p>TL;DR: This post explains how Jaegers 🚗 HotROD 🚗 app was migrated to the OpenTelemetry SDK.</p><p>Jaegers <a href="https://www.jaegertracing.io/docs/1.42/getting-started/#sample-app-hotrod">HotROD demo</a> has been around for a few years. It was written with OpenTracing-based instrumentation, including a couple of OSS libraries for HTTP and gRPC middleware, and used Jaegers native SDK for Go, <a href="https://github.com/jaegertracing/jaeger-client-go">jaeger-client-go</a>. The latter was deprecated in 2022, so we had a choice to either convert all of the HotROD apps instrumentation to OpenTelemetry, or try the OpenTracing-bridge, which is a required part of every OpenTelemetry API / SDK. The bridge is an adapter layer that wraps an OpenTelemetry Tracer in a facade to makes it look like the OpenTracing Tracer. This way we can use the OpenTelemetry SDK in an application like HotROD that only understands the OpenTracing API.</p><p>I wanted to try the bridge solution, to minimize the code changes in the application. It is not the most efficient way, since an adapter layer incurs some performance overhead, but for a demo app it seemed like a reasonable trade-off.</p><p>The code can be found in the <a href="https://github.com/jaegertracing/jaeger/tree/fedeb4cab75399e4672b77efe6a067a7bd148ddf/examples/hotrod">Jaeger repository</a> (at specific commit hash).</p><h3>Setup</h3><p>First, we need to initialize the OpenTelemetry SDK and create an OpenTracing Bridge. Fortunately, I did not have to start from scratch, because there was an earlier <a href="https://github.com/jaegertracing/jaeger/pull/3390">pull request #3390</a> by <a href="https://github.com/rbroggi">@rbroggi</a>, which I picked up to make certain improvements. 
Initialization happens in pkg/tracing/init.go :</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/ba947995089d22c1c24f68f2202ac1b7/href">https://medium.com/media/ba947995089d22c1c24f68f2202ac1b7/href</a></iframe><p>In the beginning, this is a pretty vanilla OpenTelemetry SDK initialization. We create an exporter using a helper function (more on it below), and build a TracerProvider. The compliant OpenTelemetry instrumentation would use this provider to create named Tracer objects as needed, usually with distinct names reflecting the instrumentation library or the application component. However, the OpenTracing API did not have the concept of named tracers, its Tracer as a singleton, so here we create a Tracer with a blank name (in line 23) and pass it to the bridge factory that wraps it and returns an OpenTracing Tracer.</p><p>Side note: in a better-organized code there would also be some sort of close/shutdown function returned so that the caller of tracing.Init could gracefully shutdown the tracer, e.g. to flush the span buffers when stopping the application.</p><p>The original PR used the Jaeger exporter that lets the SDK export data in the Jaegers native data format. However, last year we extended Jaeger to accept OpenTelemetrys OTLP format directly, so I decided to add a bit of flexibility and make the choice of the exporter configurable:</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/67ffc7651db87b6891d18b107029b92b/href">https://medium.com/media/67ffc7651db87b6891d18b107029b92b/href</a></iframe><h3>Broken Traces</h3><p>At this point things should have started to work. 
However, the resulting traces looked like this:</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*b-RsMFPGcTtpxKaL2oRMEg.png" /><figcaption>Trace with many spans, but all coming from a single service frontend.</figcaption></figure><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*0XHZYX0Xwe32mMnQLdbH3w.png" /><figcaption>Another part of the workflow captured as a different trace. It looks like there are two services here, but in fact the HotROD app simulates the sql and redis database services, its not actually making RPC calls to them.</figcaption></figure><p>Instead of one trace per request we are getting several disjoined traces. This is where <a href="https://github.com/rbroggi">@rbroggi</a>s PR got stuck. After some debugging I came to realize that the SDK defaults to a no-op propagation, so no trace context was sent in RPC requests between services, resulting in multiple disjoined traces for the same workflow. It was easy to fix, but it felt like an unnecessary friction in using the OpenTelemetry SDK. I also added the Baggage propagator, which we will discuss later.</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/aaceccd029d20a9df3628341bfea94da/href">https://medium.com/media/aaceccd029d20a9df3628341bfea94da/href</a></iframe><p>Since the Init() function is called many times by different services in the HotROD app, I only set the propagator once using sync.Once.</p><p>After this change, the traces looked better, more colorful, so I committed the change.</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*_DzcdeHiRp6JOG2FqA3bfg.png" /><figcaption>A better-looking trace after “fixing” the propagation.</figcaption></figure><h3>Traces Still Broken</h3><p>However, I shouldve paid better attention. Notice the lone span in the middle called /driver.DriverService/FindNearest. 
Lets take a closer look:</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*XZ1VM84sCFU0kjFbkiI5ig.png" /><figcaption>A client span trying to make a gRPC request to service driver.</figcaption></figure><p>This span is a client-side of a gRPC request from frontend service to driver service. The latter is missing from the trace! This was a different issue with the context propagation. There was an error returned when trying to inject the context into the request headers. The instrumentation actually logged the error back into the client span, which we can see in the Logs section: Invalid Inject/Extract carrier. Unfortunately, it was difficult to spot this error without opening up the span, because the RPC itself was successful, and the instrumentation was correct in not setting the error=true span tag, which wouldve shown in the Jaeger UI as a red icon.</p><p>After a bit more digging I found the issue, which was due to a bug in the OpenTelemetry SDKs bridge implementation. You can read about it in the following GitHub issue.</p><p><a href="https://github.com/open-telemetry/opentelemetry-go/issues/3678">[opentracing] OT Bridge does not work with OT gRPC instrumentation · Issue #3678 · open-telemetry/opentelemetry-go</a></p><p>As of this writing, the fix if still waiting to be merged, so as a workaround I made a branch of opentracing-contrib/go-grpc and changed it to use TextMap propagation instead of HTTPHeaders, which by chance happened to work with the bridge code.</p><p>With these fixes, we were back to the “classic” HotROD traces.</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*s26BJwM-r_pP97TZ_4Nk-w.png" /><figcaption>Full HotROD trace, as the AI overlords intended.</figcaption></figure><h3>RPC Metrics</h3><p>I was ready to call it a day, but there was one piece missing. 
The original Jaeger SDK initialization code had one extra featureit was enabling the collection of RPC metrics from spans (supported only by the Go SDK in Jaeger). My original blog post, <a href="https://medium.com/opentracing/take-opentracing-for-a-hotrod-ride-f6e3141f7941">Take OpenTracing for a HotROD ride</a>, had a discussion about it, so it was a shame to lose this during this upgrade. If I were upgrading to the OpenTelemetry instrumentation as well, it might have contained a metrics-oriented instrumentation, although it would somewhat miss the point of the blog post that tracing instrumentation is already sufficient in this case. Another possibility is to generate metrics from spans using a special processor in the OpenTelemetry Collector, but using the Collector is not part of the HotROD demo setup.</p><p>The OpenTelemetry SDK has the notion of span processors, an abstract API invoked on all finished spans. It is similar to how the RPCMetricsObserver was implemented in the jaeger-client-go, so I did what any scrappy engineer would docopy &amp; paste the code from jaeger-client-go directly into the HotROD code and adopt it to implement otel.SpanProcessor. 
And voilà:</p><pre>$ curl http://127.0.0.1:8083/debug/vars | grep &#39;&quot;requests.endpoint_HTTP&#39;<br>&quot;requests.endpoint_HTTP_GET_/.error_false&quot;: 3,<br>&quot;requests.endpoint_HTTP_GET_/.error_true&quot;: 0,<br>&quot;requests.endpoint_HTTP_GET_/config.error_false&quot;: 4,<br>&quot;requests.endpoint_HTTP_GET_/config.error_true&quot;: 0,<br>&quot;requests.endpoint_HTTP_GET_/customer.error_false&quot;: 4,<br>&quot;requests.endpoint_HTTP_GET_/customer.error_true&quot;: 0,<br>&quot;requests.endpoint_HTTP_GET_/debug/vars.error_false&quot;: 5,<br>&quot;requests.endpoint_HTTP_GET_/debug/vars.error_true&quot;: 0,<br>&quot;requests.endpoint_HTTP_GET_/dispatch.error_false&quot;: 4,<br>&quot;requests.endpoint_HTTP_GET_/dispatch.error_true&quot;: 0,<br>&quot;requests.endpoint_HTTP_GET_/route.error_false&quot;: 40,<br>&quot;requests.endpoint_HTTP_GET_/route.error_true&quot;: 0,</pre><h3>Baggage</h3><p>As I was looking through the metrics in HotROD, I realized there was another area I neglected. These sections in the expvar output were not supposed to be empty:</p><pre>$ curl http://127.0.0.1:8083/debug/vars | grep route.calc.by<br>&quot;route.calc.by.customer.sec&quot;: {},<br>&quot;route.calc.by.session.sec&quot;: {}</pre><p>These measures require baggage to work. The term “baggage” was introduced in the academia (<a href="https://www.usenix.org/conference/atc16/technical-sessions/presentation/mace">Jonathan Mace <em>et al., </em>SOSP 2015 Best Paper Award</a>). It refers to a <a href="https://medium.com/jaegertracing/embracing-context-propagation-7100b9b6029a">general-purpose context propagation mechanism</a>, which can be used to carry both the tracing context and any other contextual metadata across the distributed workflow execution. 
The HotROD app demonstrates a number of capabilities that require baggage propagation, and they were all completely broken after upgrading to OpenTelemetry SDK 😭.</p><p>The first thing that broke was propagation of baggage from the web UI. HotROD does not start the trace in the browser, only in the backend. The Jaeger SDK had a feature that allowed it to accept baggage from the incoming request even when there was no incoming tracing context. Internally the Jaeger SDK achieved this by returning an “invalid” SpanContext from the Extract method where the trace ID / span ID were blank, but the baggage was present. Digging through the OpenTracing Bridge code I found that it returns an error in this case. This could probably be fixed there, but I decided to add a workaround directly to HotROD where I used the OpenTelemetrys Baggage propagator to extract the baggage from the request manually and then copy it into the span.</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/ab6775e1bef5ed4e4561486174c6754b/href">https://medium.com/media/ab6775e1bef5ed4e4561486174c6754b/href</a></iframe><p>I trimmed down the code example above a bit to only show relevant parts. The otelBaggageExtractor function creates a middleware that manually extracts the baggage into the current Context. Then the instrumentation library nethttp is given a span observer (invoked after the server span is created) which copies the baggage from the context into the span. This functionality is only needed at the root span, because once the trace context is propagated through the workflow, the Bridge correctly propagates the baggage as well (remember that I registered Baggage propagator as a global propagator in the Init function, as shown in the earlier code snippet). 
I was actually pleasantly surprised that the maintainers were able to achieve that, because the OpenTracing API operates purely on Span objects, not on the Context, while in OpenTelemetry the baggage is carried in the Context, a lower logical level.</p><p>One other small change I had to make was to change the web UI to use the baggage header (per W3C standard), instead of the jaeger-baggage header that was recognized by the Jaeger SDK.</p><p>Strictly speaking, these were all the changes I had to make to the HotROD code to make the baggage work. Yet, it didnt work. Some baggage values were correctly propagated, but others were missing. After more digging I found several places where it was silently dropped on the floor because of some (misplaced, in my opinion) validations in the baggage and the bridge/opentracing packages in the OpenTelemetry SDK. The ticket below explains the issue in more details.</p><p><a href="https://github.com/open-telemetry/opentelemetry-go/issues/3685">Baggage not working with OpenTracing Bridge · Issue #3685 · open-telemetry/opentelemetry-go</a></p><p>Running against a patched version of OpenTelemetry SDK yielded the desired behavior and the baggage-reliant functionality was restored. 
I was getting performance metrics grouped by baggage values:</p><pre>$ curl http://127.0.0.1:8083/debug/vars | grep route.calc.by<br>&quot;route.calc.by.customer.sec&quot;: {<br> &quot;Amazing Coffee Roasters&quot;: 0.9080000000000004, <br> &quot;Japanese Desserts&quot;: 1.0490000000000002, <br> &quot;Rachel&#39;s Floral Designs&quot;: 1.0090000000000003, <br> &quot;Trom Chocolatier&quot;: 1.0000000000000004<br>},<br>&quot;route.calc.by.session.sec&quot;: {<br> &quot;2885&quot;: 1.4760000000000002, <br> &quot;5161&quot;: 2.4899999999999993<br>}</pre><p>And the mutex instrumentation was able to capture IDs of multiple transactions in the queue (see the <a href="https://medium.com/opentracing/take-opentracing-for-a-hotrod-ride-f6e3141f7941">original blog post</a> for explanation of this one):</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*lxeVafTIXsC7PVzkaOLIhA.png" /><figcaption>Logs show a transactions blocked on three other transactions.</figcaption></figure><h3>Summary</h3><p>Overall, the migration required fairly minimal amount of changes to the code, mostly because I chose to reuse the existing OpenTracing instrumentation and only swap the SDK from Jaeger to OpenTelemetry. The most friction with the migration was due to a couple of bugs in the OpenTelemetry Bridge code (and likely one place in the baggage package). This only leads me to believe that the baggage functionality is not yet widely used, especially when someone uses the OpenTracing instrumentation with a bridge to OpenTelemetry, so it is likely I just ran into a bunch of the early adopter issues.</p><p>At this point I am interested in taking the next step and doing a full migration of HotROD to OpenTelemetry (or help reviewing if someone wants to volunteer!) It could make a complementary Part 2 to this post to describe how that goes.</p><p>There is also a possible Part 3 involving a no-less interesting migration to the OpenTelemetry Metrics. 
Right now all of the Jaeger code base is using an internal abstraction for metrics backed by the Prometheus SDK.</p><p>Stay tuned.</p><img src="https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=29b09fe2fbc4" width="1" height="1" alt=""><hr><p><a href="https://medium.com/jaegertracing/experiment-migrating-opentracing-based-application-in-go-to-use-the-opentelemetry-sdk-29b09fe2fbc4">Experiment: Migrating OpenTracing-based application in Go to use the OpenTelemetry SDK</a> was originally published in <a href="https://medium.com/jaegertracing">JaegerTracing</a> on Medium, where people are continuing the conversation by highlighting and responding to this story.</p>]]></content:encoded>
</item>
<item>
<title><![CDATA[Better alignment with OpenTelemetry by focusing on OTLP]]></title>
<link>https://medium.com/jaegertracing/better-alignment-with-opentelemetry-by-focusing-on-otlp-f3688939073f?source=rss-cff2e4ba6058------2</link>
<guid isPermaLink="false">https://medium.com/p/f3688939073f</guid>
<category><![CDATA[opentelemetry]]></category>
<dc:creator><![CDATA[Yuri Shkuro]]></dc:creator>
<pubDate>Thu, 03 Nov 2022 18:12:55 GMT</pubDate>
<atom:updated>2022-11-03T18:12:55.688Z</atom:updated>
<content:encoded><![CDATA[<p>TL;DR: proposal (and a <a href="https://forms.gle/aUuJg5DQwNzncJ4s8">survey</a>) to deprecate native Jaeger exporters in OpenTelemetry SDKs in favor of OTLP exporters.</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*iFJFYZsdPvaFuwaAoZ1HRQ.jpeg" /><figcaption>Photo by <a href="https://unsplash.com/@miquel_parera_mila?utm_source=unsplash&amp;utm_medium=referral&amp;utm_content=creditCopyText">Miquel Parera</a> on <a href="https://unsplash.com/?utm_source=unsplash&amp;utm_medium=referral&amp;utm_content=creditCopyText">Unsplash</a></figcaption></figure><p>This is a re-post from the <a href="https://opentelemetry.io/blog/2022/jaeger-native-otlp/">OpenTelemetry blog article</a>.</p><p>By <a href="https://github.com/breedx-splk"><strong>Jason Plumb</strong></a><strong> (Splunk)</strong> | Thursday, November 03, 2022</p><p>Back in May of 2022, the Jaeger project <a href="https://medium.com/jaegertracing/introducing-native-support-for-opentelemetry-in-jaeger-eb661be8183c">announced native support for the OpenTelemetry Protocol</a> (OTLP). This followed a <a href="https://twitter.com/YuriShkuro/status/1455170693197402119">generous deprecation cycle</a> for the Jaeger client libraries across many languages. With these changes, OpenTelemetry users are now able to send traces into Jaeger with industry-standard OTLP, and the Jaeger client library repositories have been finally archived.</p><p>We intend to <strong>deprecate Jaeger exporters from OpenTelemetry</strong> in the near future, and are looking for your feedback to determine the length of the deprecation phase. 
The best way to provide feedback is by <a href="https://forms.gle/aUuJg5DQwNzncJ4s8">filling out a 4-question survey</a> or commenting on <a href="https://github.com/open-telemetry/opentelemetry-specification/pull/2858">the existing draft pull request</a>.</p><h3>OpenTelemetry Support</h3><p>This interoperability is a wonderful victory both for Jaeger users and for OpenTelemetry users. However, were not done yet. The OpenTelemetry specification still requires support for Jaeger client exporters across languages.</p><p>This causes challenges for both Jaeger users and OpenTelemetry maintainers:</p><ol><li><strong>Confusing Choices: </strong>Currently, users are faced with a choice of exporter (Jaeger or OTLP), and this can be a source of confusion. A user might be inclined, when exporting telemetry to Jaeger, to simply choose the Jaeger exporter because the name matches (even though Jaeger now actively encourages the use of OTLP).<br>If we can eliminate this potentially confusing choice, we can improve the user experience and continue standardizing on a single interoperable protocol. We love it when things “just work” out of the box!</li><li><strong>Maintenance and duplication: </strong>Because the Jaeger client libraries are now archived, they will not receive updates (including security patches). 
To continue properly supporting Jaeger client exporters, OpenTelemetry authors would be required to re-implement some of the functionality it had previously leveraged from the Jaeger clients.<br>Now that Jaeger supports OTLP, this feels like a step backwards: It results in an increased maintenance burden with very little benefit.</li></ol><h3>User Impact</h3><p>The proposal is to deprecate the following exporters from OpenTelemetry in favor of using native OTLP into Jaeger:</p><ul><li>Jaeger Thrift over HTTP</li><li>Jaeger Protobuf via gRPC</li><li>Jaeger Thrift over UDP</li></ul><p>In addition to application configuration changes, there could be other architectural considerations. HTTP and gRPC should be straightforward replacements, although it may require exposing ports 4317 and 4318 if they are not already accessible.</p><p>Thrift over UDP implies the use of the <a href="https://www.jaegertracing.io/docs/1.24/architecture/#agent">Jaeger Agent</a>. Users with this deployment configuration will need to make a slightly more complicated change, typically one of the following:</p><ol><li>Direct ingest. Applications will change from using Thrift+UDP to sending OTLP traces directly to their jaeger-collector instance. This may also have sampling implications.</li><li>Replacing the Jaeger Agent with a sidecar <a href="https://github.com/open-telemetry/opentelemetry-collector">OpenTelemetry Collector</a> instance. This could have sampling implications and requires changes to your infrastructure deployment code.</li></ol><h3>Intent to DeprecateWed Like Your Feedback!</h3><p>In order to better support users and the interop between OpenTelemetry and Jaeger, we intend to deprecate and eventually remove support for Jaeger client exporters / Jaeger native data format in OpenTelemetry.</p><p>We would like your feedback! We want to hear from users who could be impacted by this change. 
To better make a data-informed decision, <a href="https://forms.gle/aUuJg5DQwNzncJ4s8">we have put together a short 4-question survey</a>.</p><p>Your input will help us to choose how long to deprecate before removal.</p><p>A <a href="https://github.com/open-telemetry/opentelemetry-specification/pull/2858">draft PR has been created in the specification</a> to support this deprecation. If would like to contribute and provide feedback, visit the link above and add some comments. We want to hear from you.</p><img src="https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=f3688939073f" width="1" height="1" alt=""><hr><p><a href="https://medium.com/jaegertracing/better-alignment-with-opentelemetry-by-focusing-on-otlp-f3688939073f">Better alignment with OpenTelemetry by focusing on OTLP</a> was originally published in <a href="https://medium.com/jaegertracing">JaegerTracing</a> on Medium, where people are continuing the conversation by highlighting and responding to this story.</p>]]></content:encoded>
</item>
<item>
<title><![CDATA[TEMPLE: Six Pillars of Observability]]></title>
<link>https://medium.com/@YuriShkuro/temple-six-pillars-of-observability-4ac3e3deb402?source=rss-cff2e4ba6058------2</link>
<guid isPermaLink="false">https://medium.com/p/4ac3e3deb402</guid>
<category><![CDATA[observability]]></category>
<dc:creator><![CDATA[Yuri Shkuro]]></dc:creator>
<pubDate>Mon, 19 Sep 2022 01:51:28 GMT</pubDate>
<atom:updated>2022-10-05T01:29:48.705Z</atom:updated>
<content:encoded><![CDATA[<figure><img alt="A temple in Italy with six front pillars" src="https://cdn-images-1.medium.com/max/1024/1*1yZ4WP2IDrpFNpuf7tKkhQ.png" /><figcaption>Valley of the Temples, Agrigento, AG, Italy. Photo by <a href="https://unsplash.com/@belial90?utm_source=unsplash&amp;utm_medium=referral&amp;utm_content=creditCopyText">Dario Crisafulli</a> on <a href="https://unsplash.com/?utm_source=unsplash&amp;utm_medium=referral&amp;utm_content=creditCopyText">Unsplash</a>.</figcaption></figure><p>In the past few years, much has been talked and written about the “three pillars of observability”: metrics, logs, and traces. A Google search for the phrase brings up over 7,000 results, with almost every observability vendor having a blog post or an e-book on the topic. Recently, the term MELT started showing up that adds “events” to the mix as a distinct telemetry signal. In this post, I want to show that there are even more distinct types and introduce TEMPLE, which stands for traces, events, metrics, profiles, logs, and exceptions. I call them <em>six pillars of observability</em>, on one hand to make fun of the previous terms and acronyms, but also to make the case that these signals serve distinct use cases for observability of cloud-native systems. If I fail at the latter and you dont buy my arguments, at least the TEMPLE acronym works much better with “pillars” 😉.</p><p>Attribution: One of my colleagues at Meta, Henry Bond, started using the acronym TEMPL in the internal documents. I added “Exceptions”, for completeness, and ended up with TEMPLE.</p><h3>Six Pillars Explained</h3><p>I will try to illustrate why I think these six telemetry types deserve to be considered as separate. 
It does not mean some of them cannot be supported by the same backend, but they differ in the following aspects:</p><ul><li>How each telemetry type is produced</li><li>Which unique storage requirements they impose</li><li>How they are used in the user workflows</li></ul><p>Even though the TEMPLE acronym implies a certain ordering of the signals, I do not ascribe any meaning to that other than to make up a pleasant word. For better continuity of the explanation, I will go through them in a different order.</p><p>Also, naming is hard. I will point out how, surprisingly, most of the terms we use in this space are ambiguous, and the boundaries between telemetry types are not as strict as they appear to be.</p><p><strong>Metrics, the original pillar</strong>. Numerical measurements with attributes, which are easily aggregatable both spacially (along the attribute dimensions) and temporally (combining values into less discrete time intervals). Metrics aggregates remain highly accurate, which makes them great for monitoring, but aggregations lose the original level of details, which makes metrics not as good for troubleshooting &amp; investigations.</p><p>In the context of cloud native applications, metrics usually refer to <em>operational metrics</em> of the software. <em>Business metrics</em> are actually a different category that is better captured via structured logs.</p><p><strong>Logs, the ancient pillar</strong> (if you question which came first, “ancient” or “original”, take it up with Marvel). Logs are a confusing category, ranging from arbitrary printf-like statements (aka unstructured logs) to highly structured and even schematized events. When structured logs are schematized (cf. <a href="https://research.facebook.com/publications/positional-paper-schema-first-application-telemetry/">Schema-first Application Telemetry</a>), they are often sent to different tables in the warehouse and used for analytics, including business analytics. 
Schema-free structured logs are what Honeycomb calls “arbitrarily-wide events”. Its a bit of a misnomer, because each individual log record is not “arbitrarily wide”, in fact it usually has a completely static shape that will not change in the given call site, but when we mix these log records from different call sites we can expect all kinds of shapes, making the resulting destination an “arbitrarily wide” table. Most modern logging backends can ingest structured logs and allow search and analytics on these arbitrary dimensions.</p><p>Many logs are generated in response to a service processing specific input requests, i.e. they are <em>request-scoped</em>. We, as an industry, havent really figured out how best to deal with request-scoped logs. On one hand, they look like other logs and can be stored and analyzed in a similar fashion. On the other hand, distributed tracing is specifically designed as request-scoped logging, so these logs could be much more useful when viewed in the context of a distributed trace. I met engineers whose teams chose to use tracing APIs exclusively to capture logs, so that those can be always visualized in the context of traces. A common approach to solve this is to capture trace ID as a field in the logs and build cross-correlation between logging and tracing tools.</p><p><strong>Traces, the “new cool kid on the block” pillar</strong>. The term <em>tracing</em> is quite overloaded (just look at <a href="https://docs.kernel.org/trace/index.html">Linux Tracing</a> documentation). In the context of cloud native observability, tracing usually refers to <em>distributed tracing</em>, or a special form of structured logs that are <em>request-scoped</em>, or more generally <em>workflow-centric</em>. 
In contrast to plain structured logging, tracing captures not only the log records, but also <em>causality</em> between them, and once those records are stitched into a trace they represent the trajectory of a single request (or a workflow) through a distributed system end-to-end.</p><p>Tracing opens up a realm of possibilities for reasoning about a system:</p><ul><li>Unique monitoring capabilities, e.g., an end-to-end latency of messaging workflows, which is difficult to observe with any other telemetry.</li><li>Debugging capabilities, in particular, root cause isolation. Traces may not always tell you why a system is misbehaving, but they often can narrow down which of the thousands of distributed components is at fault.</li><li>Resource usage attribution by different caller profiles or business lines.</li></ul><p><strong>Events, the misunderstood pillar</strong>. This is perhaps the worst-named category of telemetry signals because, strictly speaking, pretty much all telemetry is “events”. What people usually mean by this category is <em>change events</em>, i.e., events that are external to the observed system that cause some changes in that system. The most common examples are: deployments of application code (and the corresponding code commits), configuration changes, experiments, DR-related traffic drains, perhaps auto-scaling events, etc.</p><p>There is no practical bound to what could be considered an event that affects the system. For instance, in the early days of Uber, Halloween was the night of the highest user traffic, such that SREs would spend the whole night in the “war room”, monitoring the system and firefighting (Big Bang Theory flashback: Howard: “guidance system for drunk people”, Raj: “they already had that, its called Uber”). 
As the business became more global, the impact of Halloween as US-centric holiday became less pronounced on the system traffic, but one can easily see how holidays, or some other public events like sports or concerts, can become factors that affect the behavior of a system and might be useful to show to the operators as part of the systems observability.</p><p>One could reasonably ask: why cant we treat events simply as structured logs? As far as the data shape in the storage, there is indeed not much difference. However, logs usually require less rigor from the backends capturing them: some level of data loss may be acceptable, and the pipelines are often set up to down-sample or throttle the logs. For example, if a bug is causing an application to log a certain error message, its likely that well have many similar logs, so its not critical to guarantee that every one of them is stored. This is very different from the handling of change events, which should be all stored reliably, because if we miss the record about that one single code deployment that caused the issue and needs to be rolled back, our outage investigation might take much longer. Similarly, when querying for logs, its usually sufficient to find some samples of a pattern, or to get aggregate statistics, there is not much emphasis on finding a very specific individual log record. But with change events, were precisely looking for very specific instances. Finally, change events are usually produced in much fewer volumes than logs. These differences in the requirements often lead to different designs and trade-offs in the logs and events backends.</p><p><strong>Profiles, the geek pillar</strong>. Profiles are another category of telemetry that is tricky to define, although you would know it when you see one. 
Profiles are just being introduced as a new signal to the OpenTelemetry scope in the <a href="https://github.com/open-telemetry/oteps/pull/212">OTEP-212</a>, and even that document had a bit of a tough time defining what a profile is. Its latest definition is “a collection of stack traces with some metric associated with each stack trace, typically representing the number of times that stack trace was encountered”. Mouthful.</p><p>Many engineers encounter profiling tasks at some point, but from my experience most of them do not have to deal with profiles very often, unless they specialize in performance and efficiency optimizations. Profiling tools, as a result, tend to be somewhat esoteric, focusing on power users.</p><p>Profiles, unlike most other telemetry types, almost never require explicit instrumentation, instead relying on deeper integration with the runtimes to capture the call stacks. They often generate very large amounts of data, requiring specially designed backends.</p><p><strong>Exceptions, the forgotten pillar</strong>. Finally, lets not forget the exceptions. Remember the “Stacktrace or GTFO” comics? When I first came across it, I was working at an investment bank developing trading systems, and we had zero observability into the system running in production (we could get access to logs on the hosts, but only by going through a special permissions escalation process, because … lawyers). So the comics resonated with me a lot at the time. But years later, my attitude changed to “couldnt he just use Sentry or something?”</p><p>My first experience with Sentry was by accident. We just integrated Jaeger SDK into a Python framework that was widely used at Uber. Next morning I am getting a UBN ticket (“UnBreakNow”, i.e., high urgency) that says they are getting errors in production and the stacktrace points to Jaeger SDK code. 
But instead of a stacktrace the ticket had a link to Sentry, which was the open source system Uber deployed to capture and aggregate exceptions. I was blown away by the amount of information captured by Raven (Sentrys SDK) besides the stacktrace itself. The most useful was the ability to inspect values of all local variables at every frame of the stack. That immediately revealed the root cause, which had to do with the handling of utf-8 strings.</p><p>Exceptions are, strictly speaking, a specialized form of structured logs, although you may need much more structure than the typical structured logging API allows (like nested collections, etc.) The processing pipelines for exceptions are also pretty specialized: they often involve symbolication, fingerprinting, stacks pruning, and deduplication. Finally, the UI for viewing this data is also highly customized to this data source. All these factors lead me to conclude that exceptions should really be treated as an independent telemetry type.</p><h3>Pillars are not Observability</h3><p>Now that we covered the six pillars, its worth remembering that pillars do not guarantee observability, which is defined, perhaps counterintuitively, as the ability to <em>understand</em> the system (s internal state) from its outputs, not just to <em>observe the outputs</em>. These pillars are just different types of telemetry that can be produced, the raw data. To be effective in investigations, the observability platform needs to be able to combine these signals into solutions for specific workflows. Even with free-form investigations, where you are “on your own” because all the guided investigation workflows failed, the platform can provide many features to assist you, such as understanding the metadata of the telemetry and allowing cross-telemetry correlations, or automation of insights and pattern recognition. Pillars are what you build upon, not the end goal.</p><h3>Takeaways</h3><ol><li>Stop saying “three pillars”. 
There are more than three.</li><li>Start saying TEMPLE, if you must name them.</li><li>Dont take it seriously. The boundaries are diffuse.</li><li>Pillars ≠ observability, they are just data.</li></ol><img src="https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=4ac3e3deb402" width="1" height="1" alt="">]]></content:encoded>
</item>
</channel>
</rss>`;
// Checks `isMediumUrl` against a table of URLs: four Medium URL shapes
// (subdomain, tag path, plain path, `@user` path) that must be accepted and
// one unrelated URL that must be rejected.
Deno.test('isMediumUrl', () => {
  const cases: ReadonlyArray<readonly [string, boolean]> = [
    ['https://acceldataio.medium.com', true],
    ['https://medium.com/tag/kubernetes', true],
    ['https://medium.com/jaegertracing', true],
    ['https://medium.com/@YuriShkuro', true],
    ['https://www.google.de/', false],
  ];

  for (const [url, expected] of cases) {
    assertEquals(isMediumUrl(url), expected);
  }
});
// Checks `parseMediumOption` against a table of `[input, expected feed URL]`
// pairs. Inputs cover the `#tag` and `@user` shorthands as well as full URLs,
// both with and without the `feed` segment already present.
Deno.test('parseMediumOption', () => {
  const cases: ReadonlyArray<readonly [string, string]> = [
    ['#kubernetes', 'https://medium.com/feed/tag/kubernetes'],
    ['@YuriShkuro', 'https://medium.com/feed/@YuriShkuro'],
    [
      'https://medium.com/feed/tag/kubernetes',
      'https://medium.com/feed/tag/kubernetes',
    ],
    [
      'https://medium.com/feed/@YuriShkuro',
      'https://medium.com/feed/@YuriShkuro',
    ],
    ['https://acceldataio.medium.com', 'https://acceldataio.medium.com/feed'],
    [
      'https://acceldataio.medium.com/feed',
      'https://acceldataio.medium.com/feed',
    ],
    [
      'https://medium.com/jaegertracing',
      'https://medium.com/feed/jaegertracing',
    ],
    [
      'https://medium.com/feed/jaegertracing',
      'https://medium.com/feed/jaegertracing',
    ],
  ];

  for (const [option, expectedFeedUrl] of cases) {
    assertEquals(parseMediumOption(option), expectedFeedUrl);
  }
});
// Runs `getMediumFeed` for the `#kubernetes` tag option with
// `utils.fetchWithTimeout` and `feedutils.getFavicon` stubbed, then verifies
// the returned source, the two returned items, and how each stub was called.
Deno.test('getMediumFeed - Tag', async () => {
  // Values that appear in several expectations below.
  const feedUrl = 'https://medium.com/feed/tag/kubernetes';
  const feedLink =
    'https://medium.com/tag/kubernetes/latest?source=rss------kubernetes-5';
  const expectedSourceId =
    'medium-myuser-mycolumn-40f28b0a56743a117745ac7dfd785111';

  // The feed request answers once with the tag fixture.
  const fetchWithTimeoutSpy = stub(
    utils,
    'fetchWithTimeout',
    returnsNext([
      Promise.resolve(new Response(responseTag, { status: 200 })),
    ]),
  );
  // The favicon lookup answers once with `undefined`.
  const getFaviconSpy = stub(
    feedutils,
    'getFavicon',
    returnsNext([Promise.resolve(undefined)]),
  );

  try {
    const feed = await getMediumFeed(
      supabaseClient,
      undefined,
      mockProfile,
      { ...mockSource, options: { medium: '#kubernetes' } },
    );

    feedutils.assertEqualsSource(feed.source, {
      id: expectedSourceId,
      columnId: 'mycolumn',
      userId: 'myuser',
      type: 'medium',
      title: 'Kubernetes on Medium',
      options: { medium: feedUrl },
      link: feedLink,
    });

    feedutils.assertEqualsItems(feed.items, [
      {
        id:
          'medium-myuser-mycolumn-40f28b0a56743a117745ac7dfd785111-6286c43a67f72950c40bd4f537b66ed4',
        userId: 'myuser',
        columnId: 'mycolumn',
        sourceId: expectedSourceId,
        title:
          'Securing the Cloud: A Comprehensive Guide to Building a Kubernetes Threat Model for Enhanced…',
        link:
          'https://blackcatdev.medium.com/securing-the-cloud-a-comprehensive-guide-to-building-a-kubernetes-threat-model-for-enhanced-e695e24db604?source=rss------kubernetes-5',
        media: 'https://cdn-images-1.medium.com/max/2600/0*HDPbjcSA7DITNbFp',
        description:
          '<div class="medium-feed-item"><p class="medium-feed-image"><a href="https://blackcatdev.medium.com/securing-the-cloud-a-comprehensive-guide-to-building-a-kubernetes-threat-model-for-enhanced-e695e24db604?source=rss------kubernetes-5"><img src="https://cdn-images-1.medium.com/max/2600/0*HDPbjcSA7DITNbFp" width="4000"></a></p><p class="medium-feed-snippet">In the ever-evolving landscape of cloud computing and container orchestration, Kubernetes has emerged as a dominant force, providing a&#x2026;</p><p class="medium-feed-link"><a href="https://blackcatdev.medium.com/securing-the-cloud-a-comprehensive-guide-to-building-a-kubernetes-threat-model-for-enhanced-e695e24db604?source=rss------kubernetes-5">Continue reading on Medium »</a></p></div>',
        author: 'BlackCatDev',
        publishedAt: 1701800320,
      },
      {
        id:
          'medium-myuser-mycolumn-40f28b0a56743a117745ac7dfd785111-3ee71eda8d5f6978bb8b813f6ba34608',
        userId: 'myuser',
        columnId: 'mycolumn',
        sourceId: expectedSourceId,
        title:
          'Fortifying Kubernetes Deployments: A Comprehensive Guide to Securing CI/CD Pipelines',
        link:
          'https://blackcatdev.medium.com/fortifying-kubernetes-deployments-a-comprehensive-guide-to-securing-ci-cd-pipelines-096aacb27637?source=rss------kubernetes-5',
        media: 'https://cdn-images-1.medium.com/max/2600/0*XZum3wouH1vlsKV9',
        description:
          '<div class="medium-feed-item"><p class="medium-feed-image"><a href="https://blackcatdev.medium.com/fortifying-kubernetes-deployments-a-comprehensive-guide-to-securing-ci-cd-pipelines-096aacb27637?source=rss------kubernetes-5"><img src="https://cdn-images-1.medium.com/max/2600/0*XZum3wouH1vlsKV9" width="7952"></a></p><p class="medium-feed-snippet"># Securing CI/CD Pipelines for Kubernetes Deployments</p><p class="medium-feed-link"><a href="https://blackcatdev.medium.com/fortifying-kubernetes-deployments-a-comprehensive-guide-to-securing-ci-cd-pipelines-096aacb27637?source=rss------kubernetes-5">Continue reading on Medium »</a></p></div>',
        author: 'BlackCatDev',
        publishedAt: 1701800223,
      },
    ]);
  } finally {
    // Always put the real implementations back, even when an assertion fails.
    fetchWithTimeoutSpy.restore();
    getFaviconSpy.restore();
  }

  // The feed was requested from the normalized tag feed URL.
  assertSpyCall(fetchWithTimeoutSpy, 0, {
    args: [feedUrl, { method: 'get' }, 5000],
    returned: Promise.resolve(new Response(responseTag, { status: 200 })),
  });
  // The favicon was looked up for the channel link using `faviconFilter`.
  assertSpyCall(getFaviconSpy, 0, {
    args: [feedLink, faviconFilter],
    returned: Promise.resolve(undefined),
  });
  // Each stub was invoked exactly once.
  assertSpyCalls(fetchWithTimeoutSpy, 1);
  assertSpyCalls(getFaviconSpy, 1);
});
Deno.test('getMediumFeed - User', async () => {
const fetchWithTimeoutSpy = stub(
utils,
'fetchWithTimeout',
returnsNext([
new Promise((resolve) => {
resolve(new Response(responseUser, { status: 200 }));
}),
]),
);
const getFaviconSpy = stub(
feedutils,
'getFavicon',
returnsNext([
new Promise((resolve) => {
resolve(undefined);
}),
]),
);
try {
const { source, items } = await getMediumFeed(
supabaseClient,
undefined,
mockProfile,
{ ...mockSource, options: { medium: '@YuriShkuro' } },
);
feedutils.assertEqualsSource(source, {
id: 'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4',
columnId: 'mycolumn',
userId: 'myuser',
type: 'medium',
title: 'Stories by Yuri Shkuro on Medium',
options: { medium: 'https://medium.com/feed/@YuriShkuro' },
link: 'https://medium.com/@YuriShkuro?source=rss-cff2e4ba6058------2',
});
feedutils.assertEqualsItems(items, [{
id:
'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4-7219a04f5c706f5e6c9e4af2f1060fe4',
userId: 'myuser',
columnId: 'mycolumn',
sourceId: 'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4',
title:
'Experiment: Migrating OpenTracing-based application in Go to use the OpenTelemetry SDK',
link:
'https://medium.com/jaegertracing/experiment-migrating-opentracing-based-application-in-go-to-use-the-opentelemetry-sdk-29b09fe2fbc4?source=rss-cff2e4ba6058------2',
media:
'https://cdn-images-1.medium.com/max/1024/1*b-RsMFPGcTtpxKaL2oRMEg.png',
description:
'<p>TL;DR: This post explains how Jaegers 🚗 HotROD 🚗 app was migrated to the OpenTelemetry SDK.</p><p>Jaegers <a href="https://www.jaegertracing.io/docs/1.42/getting-started/#sample-app-hotrod">HotROD demo</a> has been around for a few years. It was written with OpenTracing-based instrumentation, including a couple of OSS libraries for HTTP and gRPC middleware, and used Jaegers native SDK for Go, <a href="https://github.com/jaegertracing/jaeger-client-go">jaeger-client-go</a>. The latter was deprecated in 2022, so we had a choice to either convert all of the HotROD apps instrumentation to OpenTelemetry, or try the OpenTracing-bridge, which is a required part of every OpenTelemetry API / SDK. The bridge is an adapter layer that wraps an OpenTelemetry Tracer in a facade to makes it look like the OpenTracing Tracer. This way we can use the OpenTelemetry SDK in an application like HotROD that only understands the OpenTracing API.</p><p>I wanted to try the bridge solution, to minimize the code changes in the application. It is not the most efficient way, since an adapter layer incurs some performance overhead, but for a demo app it seemed like a reasonable trade-off.</p><p>The code can be found in the <a href="https://github.com/jaegertracing/jaeger/tree/fedeb4cab75399e4672b77efe6a067a7bd148ddf/examples/hotrod">Jaeger repository</a> (at specific commit hash).</p><h3>Setup</h3><p>First, we need to initialize the OpenTelemetry SDK and create an OpenTracing Bridge. Fortunately, I did not have to start from scratch, because there was an earlier <a href="https://github.com/jaegertracing/jaeger/pull/3390">pull request #3390</a> by <a href="https://github.com/rbroggi">@rbroggi</a>, which I picked up to make certain improvements. 
Initialization happens in pkg/tracing/init.go :</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/ba947995089d22c1c24f68f2202ac1b7/href">https://medium.com/media/ba947995089d22c1c24f68f2202ac1b7/href</a></iframe><p>In the beginning, this is a pretty vanilla OpenTelemetry SDK initialization. We create an exporter using a helper function (more on it below), and build a TracerProvider. The compliant OpenTelemetry instrumentation would use this provider to create named Tracer objects as needed, usually with distinct names reflecting the instrumentation library or the application component. However, the OpenTracing API did not have the concept of named tracers, its Tracer as a singleton, so here we create a Tracer with a blank name (in line 23) and pass it to the bridge factory that wraps it and returns an OpenTracing Tracer.</p><p>Side note: in a better-organized code there would also be some sort of close/shutdown function returned so that the caller of tracing.Init could gracefully shutdown the tracer, e.g. to flush the span buffers when stopping the application.</p><p>The original PR used the Jaeger exporter that lets the SDK export data in the Jaegers native data format. However, last year we extended Jaeger to accept OpenTelemetrys OTLP format directly, so I decided to add a bit of flexibility and make the choice of the exporter configurable:</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/67ffc7651db87b6891d18b107029b92b/href">https://medium.com/media/67ffc7651db87b6891d18b107029b92b/href</a></iframe><h3>Broken Traces</h3><p>At this point things should have started to work. 
However, the resulting traces looked like this:</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*b-RsMFPGcTtpxKaL2oRMEg.png" /><figcaption>Trace with many spans, but all coming from a single service frontend.</figcaption></figure><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*0XHZYX0Xwe32mMnQLdbH3w.png" /><figcaption>Another part of the workflow captured as a different trace. It looks like there are two services here, but in fact the HotROD app simulates the sql and redis database services, its not actually making RPC calls to them.</figcaption></figure><p>Instead of one trace per request we are getting several disjoined traces. This is where <a href="https://github.com/rbroggi">@rbroggi</a>s PR got stuck. After some debugging I came to realize that the SDK defaults to a no-op propagation, so no trace context was sent in RPC requests between services, resulting in multiple disjoined traces for the same workflow. It was easy to fix, but it felt like an unnecessary friction in using the OpenTelemetry SDK. I also added the Baggage propagator, which we will discuss later.</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/aaceccd029d20a9df3628341bfea94da/href">https://medium.com/media/aaceccd029d20a9df3628341bfea94da/href</a></iframe><p>Since the Init() function is called many times by different services in the HotROD app, I only set the propagator once using sync.Once.</p><p>After this change, the traces looked better, more colorful, so I committed the change.</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*_DzcdeHiRp6JOG2FqA3bfg.png" /><figcaption>A better-looking trace after “fixing” the propagation.</figcaption></figure><h3>Traces Still Broken</h3><p>However, I shouldve paid better attention. Notice the lone span in the middle called /driver.DriverService/FindNearest. 
Lets take a closer look:</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*XZ1VM84sCFU0kjFbkiI5ig.png" /><figcaption>A client span trying to make a gRPC request to service driver.</figcaption></figure><p>This span is a client-side of a gRPC request from frontend service to driver service. The latter is missing from the trace! This was a different issue with the context propagation. There was an error returned when trying to inject the context into the request headers. The instrumentation actually logged the error back into the client span, which we can see in the Logs section: Invalid Inject/Extract carrier. Unfortunately, it was difficult to spot this error without opening up the span, because the RPC itself was successful, and the instrumentation was correct in not setting the error=true span tag, which wouldve shown in the Jaeger UI as a red icon.</p><p>After a bit more digging I found the issue, which was due to a bug in the OpenTelemetry SDKs bridge implementation. You can read about it in the following GitHub issue.</p><p><a href="https://github.com/open-telemetry/opentelemetry-go/issues/3678">[opentracing] OT Bridge does not work with OT gRPC instrumentation · Issue #3678 · open-telemetry/opentelemetry-go</a></p><p>As of this writing, the fix if still waiting to be merged, so as a workaround I made a branch of opentracing-contrib/go-grpc and changed it to use TextMap propagation instead of HTTPHeaders, which by chance happened to work with the bridge code.</p><p>With these fixes, we were back to the “classic” HotROD traces.</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*s26BJwM-r_pP97TZ_4Nk-w.png" /><figcaption>Full HotROD trace, as the AI overlords intended.</figcaption></figure><h3>RPC Metrics</h3><p>I was ready to call it a day, but there was one piece missing. 
The original Jaeger SDK initialization code had one extra featureit was enabling the collection of RPC metrics from spans (supported only by the Go SDK in Jaeger). My original blog post, <a href="https://medium.com/opentracing/take-opentracing-for-a-hotrod-ride-f6e3141f7941">Take OpenTracing for a HotROD ride</a>, had a discussion about it, so it was a shame to lose this during this upgrade. If I were upgrading to the OpenTelemetry instrumentation as well, it might have contained a metrics-oriented instrumentation, although it would somewhat miss the point of the blog post that tracing instrumentation is already sufficient in this case. Another possibility is to generate metrics from spans using a special processor in the OpenTelemetry Collector, but using the Collector is not part of the HotROD demo setup.</p><p>The OpenTelemetry SDK has the notion of span processors, an abstract API invoked on all finished spans. It is similar to how the RPCMetricsObserver was implemented in the jaeger-client-go, so I did what any scrappy engineer would docopy & paste the code from jaeger-client-go directly into the HotROD code and adopt it to implement otel.SpanProcessor. 
And voilà:</p><pre>$ curl http://127.0.0.1:8083/debug/vars | grep \'"requests.endpoint_HTTP\'<br>"requests.endpoint_HTTP_GET_/.error_false": 3,<br>"requests.endpoint_HTTP_GET_/.error_true": 0,<br>"requests.endpoint_HTTP_GET_/config.error_false": 4,<br>"requests.endpoint_HTTP_GET_/config.error_true": 0,<br>"requests.endpoint_HTTP_GET_/customer.error_false": 4,<br>"requests.endpoint_HTTP_GET_/customer.error_true": 0,<br>"requests.endpoint_HTTP_GET_/debug/vars.error_false": 5,<br>"requests.endpoint_HTTP_GET_/debug/vars.error_true": 0,<br>"requests.endpoint_HTTP_GET_/dispatch.error_false": 4,<br>"requests.endpoint_HTTP_GET_/dispatch.error_true": 0,<br>"requests.endpoint_HTTP_GET_/route.error_false": 40,<br>"requests.endpoint_HTTP_GET_/route.error_true": 0,</pre><h3>Baggage</h3><p>As I was looking through the metrics in HotROD, I realized there was another area I neglected. These sections in the expvar output were not supposed to be empty:</p><pre>$ curl http://127.0.0.1:8083/debug/vars | grep route.calc.by<br>"route.calc.by.customer.sec": {},<br>"route.calc.by.session.sec": {}</pre><p>These measures require baggage to work. The term “baggage” was introduced in the academia (<a href="https://www.usenix.org/conference/atc16/technical-sessions/presentation/mace">Jonathan Mace <em>et al., </em>SOSP 2015 Best Paper Award</a>). It refers to a <a href="https://medium.com/jaegertracing/embracing-context-propagation-7100b9b6029a">general-purpose context propagation mechanism</a>, which can be used to carry both the tracing context and any other contextual metadata across the distributed workflow execution. The HotROD app demonstrates a number of capabilities that require baggage propagation, and they were all completely broken after upgrading to OpenTelemetry SDK 😭.</p><p>The first thing that broke was propagation of baggage from the web UI. HotROD does not start the trace in the browser, only in the backend. 
The Jaeger SDK had a feature that allowed it to accept baggage from the incoming request even when there was no incoming tracing context. Internally the Jaeger SDK achieved this by returning an “invalid” SpanContext from the Extract method where the trace ID / span ID were blank, but the baggage was present. Digging through the OpenTracing Bridge code I found that it returns an error in this case. This could probably be fixed there, but I decided to add a workaround directly to HotROD where I used the OpenTelemetrys Baggage propagator to extract the baggage from the request manually and then copy it into the span.</p><iframe src="" width="0" height="0" frameborder="0" scrolling="no"><a href="https://medium.com/media/ab6775e1bef5ed4e4561486174c6754b/href">https://medium.com/media/ab6775e1bef5ed4e4561486174c6754b/href</a></iframe><p>I trimmed down the code example above a bit to only show relevant parts. The otelBaggageExtractor function creates a middleware that manually extracts the baggage into the current Context. Then the instrumentation library nethttp is given a span observer (invoked after the server span is created) which copies the baggage from the context into the span. This functionality is only needed at the root span, because once the trace context is propagated through the workflow, the Bridge correctly propagates the baggage as well (remember that I registered Baggage propagator as a global propagator in the Init function, as shown in the earlier code snippet). 
I was actually pleasantly surprised that the maintainers were able to achieve that, because the OpenTracing API operates purely on Span objects, not on the Context, while in OpenTelemetry the baggage is carried in the Context, a lower logical level.</p><p>One other small change I had to make was to change the web UI to use the baggage header (per W3C standard), instead of the jaeger-baggage header that was recognized by the Jaeger SDK.</p><p>Strictly speaking, these were all the changes I had to make to the HotROD code to make the baggage work. Yet, it didnt work. Some baggage values were correctly propagated, but others were missing. After more digging I found several places where it was silently dropped on the floor because of some (misplaced, in my opinion) validations in the baggage and the bridge/opentracing packages in the OpenTelemetry SDK. The ticket below explains the issue in more details.</p><p><a href="https://github.com/open-telemetry/opentelemetry-go/issues/3685">Baggage not working with OpenTracing Bridge · Issue #3685 · open-telemetry/opentelemetry-go</a></p><p>Running against a patched version of OpenTelemetry SDK yielded the desired behavior and the baggage-reliant functionality was restored. 
I was getting performance metrics grouped by baggage values:</p><pre>$ curl http://127.0.0.1:8083/debug/vars | grep route.calc.by<br>"route.calc.by.customer.sec": {<br> "Amazing Coffee Roasters": 0.9080000000000004, <br> "Japanese Desserts": 1.0490000000000002, <br> "Rachel\'s Floral Designs": 1.0090000000000003, <br> "Trom Chocolatier": 1.0000000000000004<br>},<br>"route.calc.by.session.sec": {<br> "2885": 1.4760000000000002, <br> "5161": 2.4899999999999993<br>}</pre><p>And the mutex instrumentation was able to capture IDs of multiple transactions in the queue (see the <a href="https://medium.com/opentracing/take-opentracing-for-a-hotrod-ride-f6e3141f7941">original blog post</a> for explanation of this one):</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*lxeVafTIXsC7PVzkaOLIhA.png" /><figcaption>Logs show a transactions blocked on three other transactions.</figcaption></figure><h3>Summary</h3><p>Overall, the migration required fairly minimal amount of changes to the code, mostly because I chose to reuse the existing OpenTracing instrumentation and only swap the SDK from Jaeger to OpenTelemetry. The most friction with the migration was due to a couple of bugs in the OpenTelemetry Bridge code (and likely one place in the baggage package). This only leads me to believe that the baggage functionality is not yet widely used, especially when someone uses the OpenTracing instrumentation with a bridge to OpenTelemetry, so it is likely I just ran into a bunch of the early adopter issues.</p><p>At this point I am interested in taking the next step and doing a full migration of HotROD to OpenTelemetry (or help reviewing if someone wants to volunteer!) It could make a complementary Part 2 to this post to describe how that goes.</p><p>There is also a possible Part 3 involving a no-less interesting migration to the OpenTelemetry Metrics. 
Right now all of the Jaeger code base is using an internal abstraction for metrics backed by the Prometheus SDK.</p><p>Stay tuned.</p><img src="https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=29b09fe2fbc4" width="1" height="1" alt=""><hr><p><a href="https://medium.com/jaegertracing/experiment-migrating-opentracing-based-application-in-go-to-use-the-opentelemetry-sdk-29b09fe2fbc4">Experiment: Migrating OpenTracing-based application in Go to use the OpenTelemetry SDK</a> was originally published in <a href="https://medium.com/jaegertracing">JaegerTracing</a> on Medium, where people are continuing the conversation by highlighting and responding to this story.</p>',
author: 'Yuri Shkuro',
publishedAt: 1675921446,
}, {
id:
'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4-c4bb87fb4dbeee1a531afbe4fc8b5433',
userId: 'myuser',
columnId: 'mycolumn',
sourceId: 'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4',
title: 'Better alignment with OpenTelemetry by focusing on OTLP',
link:
'https://medium.com/jaegertracing/better-alignment-with-opentelemetry-by-focusing-on-otlp-f3688939073f?source=rss-cff2e4ba6058------2',
media:
'https://cdn-images-1.medium.com/max/1024/1*iFJFYZsdPvaFuwaAoZ1HRQ.jpeg',
description:
'<p>TL;DR: proposal (and a <a href="https://forms.gle/aUuJg5DQwNzncJ4s8">survey</a>) to deprecate native Jaeger exporters in OpenTelemetry SDKs in favor of OTLP exporters.</p><figure><img alt="" src="https://cdn-images-1.medium.com/max/1024/1*iFJFYZsdPvaFuwaAoZ1HRQ.jpeg" /><figcaption>Photo by <a href="https://unsplash.com/@miquel_parera_mila?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Miquel Parera</a> on <a href="https://unsplash.com/?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Unsplash</a></figcaption></figure><p>This is a re-post from the <a href="https://opentelemetry.io/blog/2022/jaeger-native-otlp/">OpenTelemetry blog article</a>.</p><p>By <a href="https://github.com/breedx-splk"><strong>Jason Plumb</strong></a><strong> (Splunk)</strong> | Thursday, November 03, 2022</p><p>Back in May of 2022, the Jaeger project <a href="https://medium.com/jaegertracing/introducing-native-support-for-opentelemetry-in-jaeger-eb661be8183c">announced native support for the OpenTelemetry Protocol</a> (OTLP). This followed a <a href="https://twitter.com/YuriShkuro/status/1455170693197402119">generous deprecation cycle</a> for the Jaeger client libraries across many languages. With these changes, OpenTelemetry users are now able to send traces into Jaeger with industry-standard OTLP, and the Jaeger client library repositories have been finally archived.</p><p>We intend to <strong>deprecate Jaeger exporters from OpenTelemetry</strong> in the near future, and are looking for your feedback to determine the length of the deprecation phase. The best way to provide feedback is by <a href="https://forms.gle/aUuJg5DQwNzncJ4s8">filling out a 4-question survey</a> or commenting on <a href="https://github.com/open-telemetry/opentelemetry-specification/pull/2858">the existing draft pull request</a>.</p><h3>OpenTelemetry Support</h3><p>This interoperability is a wonderful victory both for Jaeger users and for OpenTelemetry users. 
However, were not done yet. The OpenTelemetry specification still requires support for Jaeger client exporters across languages.</p><p>This causes challenges for both Jaeger users and OpenTelemetry maintainers:</p><ol><li><strong>Confusing Choices: </strong>Currently, users are faced with a choice of exporter (Jaeger or OTLP), and this can be a source of confusion. A user might be inclined, when exporting telemetry to Jaeger, to simply choose the Jaeger exporter because the name matches (even though Jaeger now actively encourages the use of OTLP).<br>If we can eliminate this potentially confusing choice, we can improve the user experience and continue standardizing on a single interoperable protocol. We love it when things “just work” out of the box!</li><li><strong>Maintenance and duplication: </strong>Because the Jaeger client libraries are now archived, they will not receive updates (including security patches). To continue properly supporting Jaeger client exporters, OpenTelemetry authors would be required to re-implement some of the functionality it had previously leveraged from the Jaeger clients.<br>Now that Jaeger supports OTLP, this feels like a step backwards: It results in an increased maintenance burden with very little benefit.</li></ol><h3>User Impact</h3><p>The proposal is to deprecate the following exporters from OpenTelemetry in favor of using native OTLP into Jaeger:</p><ul><li>Jaeger Thrift over HTTP</li><li>Jaeger Protobuf via gRPC</li><li>Jaeger Thrift over UDP</li></ul><p>In addition to application configuration changes, there could be other architectural considerations. HTTP and gRPC should be straightforward replacements, although it may require exposing ports 4317 and 4318 if they are not already accessible.</p><p>Thrift over UDP implies the use of the <a href="https://www.jaegertracing.io/docs/1.24/architecture/#agent">Jaeger Agent</a>. 
Users with this deployment configuration will need to make a slightly more complicated change, typically one of the following:</p><ol><li>Direct ingest. Applications will change from using Thrift+UDP to sending OTLP traces directly to their jaeger-collector instance. This may also have sampling implications.</li><li>Replacing the Jaeger Agent with a sidecar <a href="https://github.com/open-telemetry/opentelemetry-collector">OpenTelemetry Collector</a> instance. This could have sampling implications and requires changes to your infrastructure deployment code.</li></ol><h3>Intent to DeprecateWed Like Your Feedback!</h3><p>In order to better support users and the interop between OpenTelemetry and Jaeger, we intend to deprecate and eventually remove support for Jaeger client exporters / Jaeger native data format in OpenTelemetry.</p><p>We would like your feedback! We want to hear from users who could be impacted by this change. To better make a data-informed decision, <a href="https://forms.gle/aUuJg5DQwNzncJ4s8">we have put together a short 4-question survey</a>.</p><p>Your input will help us to choose how long to deprecate before removal.</p><p>A <a href="https://github.com/open-telemetry/opentelemetry-specification/pull/2858">draft PR has been created in the specification</a> to support this deprecation. If would like to contribute and provide feedback, visit the link above and add some comments. We want to hear from you.</p><img src="https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=f3688939073f" width="1" height="1" alt=""><hr><p><a href="https://medium.com/jaegertracing/better-alignment-with-opentelemetry-by-focusing-on-otlp-f3688939073f">Better alignment with OpenTelemetry by focusing on OTLP</a> was originally published in <a href="https://medium.com/jaegertracing">JaegerTracing</a> on Medium, where people are continuing the conversation by highlighting and responding to this story.</p>',
author: 'Yuri Shkuro',
publishedAt: 1667499175,
}, {
id:
'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4-1c4fe0b61cf2c0ce653194b1c8ce3554',
userId: 'myuser',
columnId: 'mycolumn',
sourceId: 'medium-myuser-mycolumn-77107c9209365c6c6c601a9f018a05f4',
title: 'TEMPLE: Six Pillars of Observability',
link:
'https://medium.com/@YuriShkuro/temple-six-pillars-of-observability-4ac3e3deb402?source=rss-cff2e4ba6058------2',
media:
'https://cdn-images-1.medium.com/max/1024/1*1yZ4WP2IDrpFNpuf7tKkhQ.png',
description:
'<figure><img alt="A temple in Italy with six front pillars" src="https://cdn-images-1.medium.com/max/1024/1*1yZ4WP2IDrpFNpuf7tKkhQ.png" /><figcaption>Valley of the Temples, Agrigento, AG, Italy. Photo by <a href="https://unsplash.com/@belial90?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Dario Crisafulli</a> on <a href="https://unsplash.com/?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Unsplash</a>.</figcaption></figure><p>In the past few years, much has been talked and written about the “three pillars of observability”: metrics, logs, and traces. A Google search for the phrase brings up over 7,000 results, with almost every observability vendor having a blog post or an e-book on the topic. Recently, the term MELT started showing up that adds “events” to the mix as a distinct telemetry signal. In this post, I want to show that there are even more distinct types and introduce TEMPLE, which stands for traces, events, metrics, profiles, logs, and exceptions. I call them <em>six pillars of observability</em>, on one hand to make fun of the previous terms and acronyms, but also to make the case that these signals serve distinct use cases for observability of cloud-native systems. If I fail at the latter and you dont buy my arguments, at least the TEMPLE acronym works much better with “pillars” 😉.</p><p>Attribution: One of my colleagues at Meta, Henry Bond, started using the acronym TEMPL in the internal documents. I added “Exceptions”, for completeness, and ended up with TEMPLE.</p><h3>Six Pillars Explained</h3><p>I will try to illustrate why I think these six telemetry types deserve to be considered as separate. 
It does not mean some of them cannot be supported by the same backend, but they differ in the following aspects:</p><ul><li>How each telemetry type is produced</li><li>Which unique storage requirements they impose</li><li>How they are used in the user workflows</li></ul><p>Even though the TEMPLE acronym implies a certain ordering of the signals, I do not ascribe any meaning to that other than to make up a pleasant word. For better continuity of the explanation, I will go through them in a different order.</p><p>Also, naming is hard. I will point out how, surprisingly, most of the terms we use in this space are ambiguous, and the boundaries between telemetry types are not as strict as they appear to be.</p><p><strong>Metrics, the original pillar</strong>. Numerical measurements with attributes, which are easily aggregatable both spacially (along the attribute dimensions) and temporally (combining values into less discrete time intervals). Metrics aggregates remain highly accurate, which makes them great for monitoring, but aggregations lose the original level of details, which makes metrics not as good for troubleshooting & investigations.</p><p>In the context of cloud native applications, metrics usually refer to <em>operational metrics</em> of the software. <em>Business metrics</em> are actually a different category that is better captured via structured logs.</p><p><strong>Logs, the ancient pillar</strong> (if you question which came first, “ancient” or “original”, take it up with Marvel). Logs are a confusing category, ranging from arbitrary printf-like statements (aka unstructured logs) to highly structured and even schematized events. When structured logs are schematized (cf. <a href="https://research.facebook.com/publications/positional-paper-schema-first-application-telemetry/">Schema-first Application Telemetry</a>), they are often sent to different tables in the warehouse and used for analytics, including business analytics. 
Schema-free structured logs are what Honeycomb calls “arbitrarily-wide events”. Its a bit of a misnomer, because each individual log record is not “arbitrarily wide”, in fact it usually has a completely static shape that will not change in the given call site, but when we mix these log records from different call sites we can expect all kinds of shapes, making the resulting destination an “arbitrarily wide” table. Most modern logging backends can ingest structured logs and allow search and analytics on these arbitrary dimensions.</p><p>Many logs are generated in response to a service processing specific input requests, i.e. they are <em>request-scoped</em>. We, as an industry, havent really figured out how best to deal with request-scoped logs. On one hand, they look like other logs and can be stored and analyzed in a similar fashion. On the other hand, distributed tracing is specifically designed as request-scoped logging, so these logs could be much more useful when viewed in the context of a distributed trace. I met engineers whose teams chose to use tracing APIs exclusively to capture logs, so that those can be always visualized in the context of traces. A common approach to solve this is to capture trace ID as a field in the logs and build cross-correlation between logging and tracing tools.</p><p><strong>Traces, the “new cool kid on the block” pillar</strong>. The term <em>tracing</em> is quite overloaded (just look at <a href="https://docs.kernel.org/trace/index.html">Linux Tracing</a> documentation). In the context of cloud native observability, tracing usually refers to <em>distributed tracing</em>, or a special form of structured logs that are <em>request-scoped</em>, or more generally <em>workflow-centric</em>. 
In contrast to plain structured logging, tracing captures not only the log records, but also <em>causality</em> between them, and once those records are stitched into a trace they represent the trajectory of a single request (or a workflow) through a distributed system end-to-end.</p><p>Tracing opens up a realm of possibilities for reasoning about a system:</p><ul><li>Unique monitoring capabilities, e.g., an end-to-end latency of messaging workflows, which is difficult to observe with any other telemetry.</li><li>Debugging capabilities, in particular, root cause isolation. Traces may not always tell you why a system is misbehaving, but they often can narrow down which of the thousands of distributed components is at fault.</li><li>Resource usage attribution by different caller profiles or business lines.</li></ul><p><strong>Events, the misunderstood pillar</strong>. This is perhaps the worst-named category of telemetry signals because, strictly speaking, pretty much all telemetry is “events”. What people usually mean by this category is <em>change events</em>, i.e., events that are external to the observed system that cause some changes in that system. The most common examples are: deployments of application code (and the corresponding code commits), configuration changes, experiments, DR-related traffic drains, perhaps auto-scaling events, etc.</p><p>There is no practical bound to what could be considered an event that affects the system. For instance, in the early days of Uber, Halloween was the night of the highest user traffic, such that SREs would spend the whole night in the “war room”, monitoring the system and firefighting (Big Bang Theory flashback: Howard: “guidance system for drunk people”, Raj: “they already had that, its called Uber”). 
As the business became more global, the impact of Halloween as US-centric holiday became less pronounced on the system traffic, but one can easily see how holidays, or some other public events like sports or concerts, can become factors that affect the behavior of a system and might be useful to show to the operators as part of the systems observability.</p><p>One could reasonably ask: why cant we treat events simply as structured logs? As far as the data shape in the storage, there is indeed not much difference. However, logs usually require less rigor from the backends capturing them: some level of data loss may be acceptable, and the pipelines are often set up to down-sample or throttle the logs. For example, if a bug is causing an application to log a certain error message, its likely that well have many similar logs, so its not critical to guarantee that every one of them is stored. This is very different from the handling of change events, which should be all stored reliably, because if we miss the record about that one single code deployment that caused the issue and needs to be rolled back, our outage investigation might take much longer. Similarly, when querying for logs, its usually sufficient to find some samples of a pattern, or to get aggregate statistics, there is not much emphasis on finding a very specific individual log record. But with change events, were precisely looking for very specific instances. Finally, change events are usually produced in much fewer volumes than logs. These differences in the requirements often lead to different designs and trade-offs in the logs and events backends.</p><p><strong>Profiles, the geek pillar</strong>. Profiles are another category of telemetry that is tricky to define, although you would know it when you see one. 
Profiles are just being introduced as a new signal to the OpenTelemetry scope in the <a href="https://github.com/open-telemetry/oteps/pull/212">OTEP-212</a>, and even that document had a bit of a tough time defining what a profile is. Its latest definition is “a collection of stack traces with some metric associated with each stack trace, typically representing the number of times that stack trace was encountered”. Mouthful.</p><p>Many engineers encounter profiling tasks at some point, but from my experience most of them do not have to deal with profiles very often, unless they specialize in performance and efficiency optimizations. Profiling tools, as a result, tend to be somewhat esoteric, focusing on power users.</p><p>Profiles, unlike most other telemetry types, almost never require explicit instrumentation, instead relying on deeper integration with the runtimes to capture the call stacks. They often generate very large amounts of data, requiring specially designed backends.</p><p><strong>Exceptions, the forgotten pillar</strong>. Finally, lets not forget the exceptions. Remember the “Stacktrace or GTFO” comics? When I first came across it, I was working at an investment bank developing trading systems, and we had zero observability into the system running in production (we could get access to logs on the hosts, but only by going through a special permissions escalation process, because … lawyers). So the comics resonated with me a lot at the time. But years later, my attitude changed to “couldnt he just use Sentry or something?”</p><p>My first experience with Sentry was by accident. We just integrated Jaeger SDK into a Python framework that was widely used at Uber. Next morning I am getting a UBN ticket (“UnBreakNow”, i.e., high urgency) that says they are getting errors in production and the stacktrace points to Jaeger SDK code. 
But instead of a stacktrace the ticket had a link to Sentry, which was the open source system Uber deployed to capture and aggregate exceptions. I was blown away by the amount of information captured by Raven (Sentrys SDK) besides the stacktrace itself. The most useful was the ability to inspect values of all local variables at every frame of the stack. That immediately revealed the root cause, which had to do with the handling of utf-8 strings.</p><p>Exceptions are, strictly speaking, a specialized form of structured logs, although you may need much more structure than the typical structured logging API allows (like nested collections, etc.) The processing pipelines for exceptions are also pretty specialized: they often involve symbolication, fingerprinting, stacks pruning, and deduplication. Finally, the UI for viewing this data is also highly customized to this data source. All these factors lead me to conclude that exceptions should really be treated as an independent telemetry type.</p><h3>Pillars are not Observability</h3><p>Now that we covered the six pillars, its worth remembering that pillars do not guarantee observability, which is defined, perhaps counterintuitively, as the ability to <em>understand</em> the system (s internal state) from its outputs, not just to <em>observe the outputs</em>. These pillars are just different types of telemetry that can be produced, the raw data. To be effective in investigations, the observability platform needs to be able to combine these signals into solutions for specific workflows. Even with free-form investigations, where you are “on your own” because all the guided investigation workflows failed, the platform can provide many features to assist you, such as understanding the metadata of the telemetry and allowing cross-telemetry correlations, or automation of insights and pattern recognition. Pillars are what you build upon, not the end goal.</p><h3>Takeaways</h3><ol><li>Stop saying “three pillars”. 
There are more than three.</li><li>Start saying TEMPLE, if you must name them.</li><li>Dont take it seriously. The boundaries are diffuse.</li><li>Pillars ≠ observability, they are just data.</li></ol><img src="https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=4ac3e3deb402" width="1" height="1" alt="">',
author: 'Yuri Shkuro',
publishedAt: 1663552288,
}]);
} finally {
fetchWithTimeoutSpy.restore();
getFaviconSpy.restore();
}
assertSpyCall(fetchWithTimeoutSpy, 0, {
args: ['https://medium.com/feed/@YuriShkuro', { method: 'get' }, 5000],
returned: new Promise((resolve) => {
resolve(new Response(responseUser, { status: 200 }));
}),
});
assertSpyCall(getFaviconSpy, 0, {
args: [
'https://medium.com/@YuriShkuro?source=rss-cff2e4ba6058------2',
faviconFilter,
],
returned: new Promise((resolve) => {
resolve(undefined);
}),
});
assertSpyCalls(fetchWithTimeoutSpy, 1);
assertSpyCalls(getFaviconSpy, 1);
});