Skip to content

Commit 78490da

Browse files
authored
[POA-2516] Parse and persist invalid JSON/YAML bodies (#56)
- Added support for capturing unparsable bodies as raw strings. - Also flags the parsing failure using the new Errors field added to the body meta.
1 parent 959c4f4 commit 78490da

File tree

5 files changed

+163
-46
lines changed

5 files changed

+163
-46
lines changed

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ require (
66
github.com/AlecAivazis/survey/v2 v2.3.6
77
github.com/OneOfOne/xxhash v1.2.8
88
github.com/Pallinder/go-randomdata v1.2.0
9-
github.com/akitasoftware/akita-ir v0.0.0-20241008173748-ca8e2e3d5db4
10-
github.com/akitasoftware/akita-libs v0.0.0-20241115053127-0af65b813d9c
9+
github.com/akitasoftware/akita-ir v0.0.0-20241213050034-057d7b6097e8
10+
github.com/akitasoftware/akita-libs v0.0.0-20241213051250-7fac08ba3594
1111
github.com/akitasoftware/go-utils v0.0.0-20240213133309-b95d4ace8803
1212
github.com/andybalholm/brotli v1.0.1
1313
github.com/aws/aws-sdk-go-v2 v1.17.1

go.sum

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,22 @@ github.com/agext/levenshtein v1.2.1 h1:QmvMAjj2aEICytGiWzmxoE0x2KZvE0fvmqMOfy2tj
2828
github.com/agext/levenshtein v1.2.1/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
2929
github.com/akitasoftware/akita-ir v0.0.0-20241008173748-ca8e2e3d5db4 h1:jbsA7E68G4dkK+ldQKQNPFIaRojFCn+cEB3gEEUSEU4=
3030
github.com/akitasoftware/akita-ir v0.0.0-20241008173748-ca8e2e3d5db4/go.mod h1:WEWPzhZtxlJnov3MxcqSDiZaHHf00vs3aJwCdt3OwzA=
31+
github.com/akitasoftware/akita-ir v0.0.0-20241210134130-1db037ecc549 h1:r5EbkpsLDonkIEQWdUqNwI1FaI0/kThShk0zD0KC08E=
32+
github.com/akitasoftware/akita-ir v0.0.0-20241210134130-1db037ecc549/go.mod h1:WEWPzhZtxlJnov3MxcqSDiZaHHf00vs3aJwCdt3OwzA=
33+
github.com/akitasoftware/akita-ir v0.0.0-20241212105914-68c47ec34a0a h1:7qu19Mmi+I9AwDZqrbGUPYmsyXu/UbwRFnlnvjyuboQ=
34+
github.com/akitasoftware/akita-ir v0.0.0-20241212105914-68c47ec34a0a/go.mod h1:WEWPzhZtxlJnov3MxcqSDiZaHHf00vs3aJwCdt3OwzA=
35+
github.com/akitasoftware/akita-ir v0.0.0-20241213050034-057d7b6097e8 h1:sf2aaOG8o+tAIMxuk7peMkKrHPp9l3/r+JmiQ3dwJ5U=
36+
github.com/akitasoftware/akita-ir v0.0.0-20241213050034-057d7b6097e8/go.mod h1:WEWPzhZtxlJnov3MxcqSDiZaHHf00vs3aJwCdt3OwzA=
3137
github.com/akitasoftware/akita-libs v0.0.0-20241115053127-0af65b813d9c h1:EFEyv9FTVqSbCKfiSlXEAcNJQoJwMpIGjHuc4uR1Bgk=
3238
github.com/akitasoftware/akita-libs v0.0.0-20241115053127-0af65b813d9c/go.mod h1:5rzkpaJI1sA9CtGELYZpf15oczAVopcyda2WsYG5xno=
39+
github.com/akitasoftware/akita-libs v0.0.0-20241210150553-bc063b60bd85 h1:uox18K4ZFvxzzNlyXL7Tzlk3FBE54Vd3auDYFzDHxvc=
40+
github.com/akitasoftware/akita-libs v0.0.0-20241210150553-bc063b60bd85/go.mod h1:mvXnUbqFuKt/PsZ4rhpBrKx0sjFhIkwIQLwmj9F4Zbc=
41+
github.com/akitasoftware/akita-libs v0.0.0-20241212110122-6626e5bb189c h1:UqBlV2CPdorbKE/xlrcnrHxzOYq562gEiv3mlHZ+2uA=
42+
github.com/akitasoftware/akita-libs v0.0.0-20241212110122-6626e5bb189c/go.mod h1:RTaDmDZR5Ytg7i+2qcvzCJByhCDY9cIkdPfy7eV30/M=
43+
github.com/akitasoftware/akita-libs v0.0.0-20241212114105-28b291366b0a h1:BecrRvaXJIbgFD3qh5v8OVSI/yVUQb9ohYRW7BWHXrA=
44+
github.com/akitasoftware/akita-libs v0.0.0-20241212114105-28b291366b0a/go.mod h1:RTaDmDZR5Ytg7i+2qcvzCJByhCDY9cIkdPfy7eV30/M=
45+
github.com/akitasoftware/akita-libs v0.0.0-20241213051250-7fac08ba3594 h1:OLA8tSW2p8ECSP8w8ldZkBVI769hUa1/E48cfM6+s7w=
46+
github.com/akitasoftware/akita-libs v0.0.0-20241213051250-7fac08ba3594/go.mod h1:Fg14kX6+N7we3KdP1c11W/SzbKsgapV1hP5d4Z/Hqwc=
3347
github.com/akitasoftware/go-utils v0.0.0-20240213133309-b95d4ace8803 h1:ebIh/EFuaP8GczzMe8EwVID/blSv5Tej6S8NE4xyarQ=
3448
github.com/akitasoftware/go-utils v0.0.0-20240213133309-b95d4ace8803/go.mod h1:+IOXf7l/QCAQECJzjJwhTp1sBkRoJ6WciZwJezUwBa4=
3549
github.com/akitasoftware/gopacket v1.1.18-0.20240820200020-7289ae956f70 h1:VnU7QLDBwRujpQoHwShs5yu0Ahv1fSalNJa4UijwlmY=

learn/parse_http.go

Lines changed: 77 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
"github.com/google/uuid"
2727
"github.com/pkg/errors"
2828
"github.com/postmanlabs/postman-insights-agent/printer"
29-
"github.com/postmanlabs/postman-insights-agent/telemetry"
3029
"golang.org/x/text/encoding/ianaindex"
3130
"golang.org/x/text/transform"
3231
"gopkg.in/yaml.v2"
@@ -145,14 +144,16 @@ func ParseHTTP(elem akinet.ParsedNetworkContent) (*PartialWitness, error) {
145144
}
146145

147146
if err != nil {
148-
// Just log an error instead of returning an error so users can see the
149-
// other parts of the endpoint in the spec rather than an empty spec.
150-
// https://app.clubhouse.io/akita-software/story/1898/juan-s-payload-problem
151-
telemetry.RateLimitError("unparsable body", err)
152-
printer.Debugf("skipping unparsable body: %v\n", err)
153-
} else if bodyData != nil {
154-
datas = append(datas, bodyData)
147+
// When the body is unparsable even after attempting fallback decompressions,
148+
// we will try to capture the body as a string and indicate parsing error in the body meta
149+
bodyStream := rawBody.CreateReader()
150+
// we are ignoring the error from decodeBody here
151+
// if the decodeStream is nil, we will capture a placeholder bodyData that would say we received an unparsable body
152+
decodeStream, _ := decodeBody(headers, bodyStream, bodyDecompressed)
153+
bodyData = captureUnparsableBody(decodeStream, contentType, statusCode)
155154
}
155+
156+
datas = append(datas, bodyData)
156157
}
157158

158159
method := &pb.Method{Id: UnassignedHTTPID(), Meta: methodMeta}
@@ -307,32 +308,7 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
307308
return parseMultipartBody("mixed", mediaParams["boundary"], bodyStream, statusCode)
308309
}
309310

310-
// Otherwise, use media type to decide how to parse the body.
311-
// TODO: XML parsing
312-
// TODO: application/json-seq (RFC 7466)?
313-
// TODO: more text/* types
314-
var parseBodyDataAs pb.HTTPBody_ContentType
315-
switch mediaType {
316-
case "application/json":
317-
parseBodyDataAs = pb.HTTPBody_JSON
318-
case "application/x-www-form-urlencoded":
319-
parseBodyDataAs = pb.HTTPBody_FORM_URL_ENCODED
320-
case "application/yaml", "application/x-yaml", "text/yaml", "text/x-yaml":
321-
parseBodyDataAs = pb.HTTPBody_YAML
322-
case "application/octet-stream":
323-
parseBodyDataAs = pb.HTTPBody_OCTET_STREAM
324-
case "text/plain", "text/csv":
325-
parseBodyDataAs = pb.HTTPBody_TEXT_PLAIN
326-
case "text/html":
327-
parseBodyDataAs = pb.HTTPBody_TEXT_HTML
328-
default:
329-
// Handle custom JSON-encoded media types.
330-
if strings.HasSuffix(mediaType, "+json") {
331-
parseBodyDataAs = pb.HTTPBody_JSON
332-
} else {
333-
parseBodyDataAs = pb.HTTPBody_OTHER
334-
}
335-
}
311+
parseBodyDataAs := getContentTypeFromMediaType(mediaType)
336312

337313
var bodyData *pb.Data
338314

@@ -443,6 +419,73 @@ func parseBody(contentType string, bodyStream io.Reader, statusCode int) (*pb.Da
443419
return bodyData, nil
444420
}
445421

422+
// When we can't parse the body, we will try to capture it as a raw primitive string and
423+
// indicate the parsing error in the body meta.
424+
func captureUnparsableBody(bodyStream io.Reader, contentType string, statusCode int) *pb.Data {
425+
mediaType, _, _ := mime.ParseMediaType(contentType)
426+
bodyData := &pb.Data{
427+
Value: newDataPrimitive(categorizeStringToPrimitive("Cannot parse body")),
428+
Meta: newDataMetaHTTPMeta(&pb.HTTPMeta{
429+
Location: &pb.HTTPMeta_Body{
430+
Body: &pb.HTTPBody{
431+
ContentType: getContentTypeFromMediaType(mediaType),
432+
OtherType: contentType,
433+
Errors: pb.HTTPBody_PARSING_ERROR,
434+
},
435+
},
436+
ResponseCode: int32(statusCode),
437+
}),
438+
}
439+
440+
if bodyStream == nil {
441+
return bodyData
442+
}
443+
444+
// Categorize the body as a string.
445+
buf := new(strings.Builder)
446+
_, err := io.Copy(buf, bodyStream)
447+
if err != nil {
448+
return bodyData
449+
}
450+
451+
bodyData.Value = newDataPrimitive(categorizeStringToPrimitive(buf.String()))
452+
return bodyData
453+
}
454+
455+
// Gets the content type to use for parsing the body based on the media type.
456+
// E.g. application/json -> JSON, application/x-www-form-urlencoded -> FORM_URL_ENCODED.
457+
// Also handles the case where the media type is a custom JSON-encoded media type.
458+
func getContentTypeFromMediaType(mediaType string) pb.HTTPBody_ContentType {
459+
// Use media type to decide how to parse the body.
460+
// TODO: XML parsing
461+
// TODO: application/json-seq (RFC 7466)?
462+
// TODO: more text/* types
463+
var parseBodyDataAs pb.HTTPBody_ContentType
464+
switch mediaType {
465+
case "application/json":
466+
parseBodyDataAs = pb.HTTPBody_JSON
467+
case "application/x-www-form-urlencoded":
468+
parseBodyDataAs = pb.HTTPBody_FORM_URL_ENCODED
469+
case "application/yaml", "application/x-yaml", "text/yaml", "text/x-yaml":
470+
parseBodyDataAs = pb.HTTPBody_YAML
471+
case "application/octet-stream":
472+
parseBodyDataAs = pb.HTTPBody_OCTET_STREAM
473+
case "text/plain", "text/csv":
474+
parseBodyDataAs = pb.HTTPBody_TEXT_PLAIN
475+
case "text/html":
476+
parseBodyDataAs = pb.HTTPBody_TEXT_HTML
477+
default:
478+
// Handle custom JSON-encoded media types.
479+
if strings.HasSuffix(mediaType, "+json") {
480+
parseBodyDataAs = pb.HTTPBody_JSON
481+
} else {
482+
parseBodyDataAs = pb.HTTPBody_OTHER
483+
}
484+
}
485+
486+
return parseBodyDataAs
487+
}
488+
446489
func parseMultipartBody(multipartType string, boundary string, bodyStream io.Reader, statusCode int) (*pb.Data, error) {
447490
fields := map[string]*pb.Data{}
448491
r := multipart.NewReader(bodyStream, boundary)

learn/parse_http_test.go

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
as "github.com/akitasoftware/akita-ir/go/api_spec"
1111
"github.com/akitasoftware/akita-libs/akinet"
1212
"github.com/akitasoftware/akita-libs/spec_util"
13+
"github.com/akitasoftware/go-utils/optionals"
1314
"github.com/postmanlabs/postman-insights-agent/telemetry"
1415
"github.com/spf13/viper"
1516
)
@@ -20,6 +21,7 @@ const (
2021
)
2122

2223
var deflatedBody bytes.Buffer
24+
var deflatedInvalidJSONBody bytes.Buffer
2325

2426
func init() {
2527
dw, err := flate.NewWriter(&deflatedBody, flate.BestCompression)
@@ -28,6 +30,14 @@ func init() {
2830
}
2931
dw.Write([]byte(`{"34302ecf": "this is prince"}`))
3032
dw.Close()
33+
34+
dw, err = flate.NewWriter(&deflatedInvalidJSONBody, flate.BestCompression)
35+
if err != nil {
36+
panic(err)
37+
}
38+
39+
dw.Write([]byte(`{"key": invalid JSON}`))
40+
dw.Close()
3141
}
3242

3343
var testBodyDict = `
@@ -84,11 +94,17 @@ func newTestBodySpecContentType(contentType string, statusCode int) *as.Data {
8494
}
8595

8696
func newTestBodySpecFromStruct(statusCode int, contentType as.HTTPBody_ContentType, originalContentType string, s map[string]*as.Data) *as.Data {
87-
return newTestBodySpecFromData(statusCode, contentType, originalContentType, dataFromStruct(s))
97+
return newTestBodySpecFromData(statusCode, contentType, originalContentType, dataFromStruct(s), optionals.None[as.HTTPBody_Errors]())
8898
}
8999

90-
func newTestBodySpecFromData(statusCode int, contentType as.HTTPBody_ContentType, originalContentType string, d *as.Data) *as.Data {
91-
d.Meta = newBodyDataMeta(statusCode, contentType, originalContentType)
100+
func newTestBodySpecFromData(
101+
statusCode int,
102+
contentType as.HTTPBody_ContentType,
103+
originalContentType string,
104+
d *as.Data,
105+
bodyError optionals.Optional[as.HTTPBody_Errors],
106+
) *as.Data {
107+
d.Meta = newBodyDataMeta(statusCode, contentType, originalContentType, bodyError)
92108
return d
93109
}
94110

@@ -99,7 +115,7 @@ func newTestMultipartFormData(statusCode int) *as.Data {
99115
Value: &as.Data_Struct{
100116
Struct: &as.Struct{
101117
Fields: map[string]*as.Data{
102-
"field1": newTestBodySpecFromData(statusCode, as.HTTPBody_TEXT_PLAIN, "text/plain", f1),
118+
"field1": newTestBodySpecFromData(statusCode, as.HTTPBody_TEXT_PLAIN, "text/plain", f1, optionals.None[as.HTTPBody_Errors]()),
103119
"field2": newTestBodySpecFromStruct(statusCode, as.HTTPBody_JSON, "application/json", map[string]*as.Data{
104120
"foo": dataFromPrimitive(spec_util.NewPrimitiveString("bar")),
105121
"baz": dataFromPrimitive(spec_util.NewPrimitiveInt64(123)),
@@ -356,6 +372,7 @@ func TestParseHTTPRequest(t *testing.T) {
356372
as.HTTPBody_OCTET_STREAM,
357373
"application/octet-stream",
358374
dataFromPrimitive(spec_util.NewPrimitiveBytes([]byte("prince is a good boy"))),
375+
optionals.None[as.HTTPBody_Errors](),
359376
),
360377
},
361378
UnknownHTTPMethodMeta(),
@@ -441,6 +458,32 @@ prince:
441458
UnknownHTTPMethodMeta(),
442459
),
443460
},
461+
&parseTest{
462+
// Capture the unparsable body and indicate a parsing error in body metadata
463+
name: "compressed body with invalid JSON should capture stringified body",
464+
testContent: newTestHTTPResponse(
465+
200,
466+
deflatedInvalidJSONBody.Bytes(),
467+
"application/json",
468+
map[string][]string{"Content-Encoding": {"deflate"}},
469+
[]*http.Cookie{},
470+
),
471+
expectedMethod: newMethod(
472+
nil,
473+
[]*as.Data{
474+
newDataHeader("Content-Encoding", 200, spec_util.NewPrimitiveString("deflate"), false),
475+
newTestBodySpecFromData(
476+
200,
477+
as.HTTPBody_JSON,
478+
"application/json",
479+
480+
dataFromPrimitive(spec_util.NewPrimitiveString("{\"key\": invalid JSON}")),
481+
optionals.Some(as.HTTPBody_PARSING_ERROR),
482+
),
483+
},
484+
UnknownHTTPMethodMeta(),
485+
),
486+
},
444487
&parseTest{
445488
// Test our fallback mechanism for auto-detecting compressed bodies.
446489
// https://app.clubhouse.io/akita-software/story/1656
@@ -468,10 +511,8 @@ prince:
468511
),
469512
},
470513
&parseTest{
471-
// Log error and skip the body if we can't parse it, instead of aborting
472-
// the whole endpoint.
473-
// https://app.clubhouse.io/akita-software/story/1898/juan-s-payload-problem
474-
name: "skip body if unable to parse",
514+
// Capture the unparsable body and indicate a parsing error in body metadata
515+
name: "uncompressed body with invalid JSON should capture stringified body",
475516
testContent: newTestHTTPResponse(
476517
200,
477518
[]byte("I am not JSON"),
@@ -485,6 +526,13 @@ prince:
485526
nil,
486527
[]*as.Data{
487528
newDataHeader("X-Charming-Level", 200, spec_util.NewPrimitiveString("extreme"), false),
529+
newTestBodySpecFromData(
530+
200,
531+
as.HTTPBody_JSON,
532+
"application/json",
533+
dataFromPrimitive(spec_util.NewPrimitiveString("I am not JSON")),
534+
optionals.Some(as.HTTPBody_PARSING_ERROR),
535+
),
488536
},
489537
UnknownHTTPMethodMeta(),
490538
),

learn/util_test.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/akitasoftware/akita-libs/memview"
1111
"github.com/akitasoftware/akita-libs/pbhash"
1212
"github.com/akitasoftware/akita-libs/spec_util"
13+
"github.com/akitasoftware/go-utils/optionals"
1314
"github.com/golang/protobuf/proto"
1415
"github.com/google/go-cmp/cmp"
1516
"github.com/pkg/errors"
@@ -190,8 +191,13 @@ func dataFromPrimitive(p *pb.Primitive) *pb.Data {
190191
return &pb.Data{Value: &pb.Data_Primitive{Primitive: p}}
191192
}
192193

193-
func newBodyDataMeta(responseCode int, contentType pb.HTTPBody_ContentType, originalContentType string) *pb.DataMeta {
194-
return newDataMeta(&pb.HTTPMeta{
194+
func newBodyDataMeta(
195+
responseCode int,
196+
contentType pb.HTTPBody_ContentType,
197+
originalContentType string,
198+
bodyErrorOpt optionals.Optional[pb.HTTPBody_Errors],
199+
) *pb.DataMeta {
200+
dataMeta := newDataMeta(&pb.HTTPMeta{
195201
Location: &pb.HTTPMeta_Body{
196202
Body: &pb.HTTPBody{
197203
ContentType: contentType,
@@ -200,6 +206,12 @@ func newBodyDataMeta(responseCode int, contentType pb.HTTPBody_ContentType, orig
200206
},
201207
ResponseCode: int32(responseCode),
202208
})
209+
210+
if bodyError, exists := bodyErrorOpt.Get(); exists {
211+
dataMeta.GetHttp().GetBody().Errors = bodyError
212+
}
213+
214+
return dataMeta
203215
}
204216

205217
func annotateIfSensitiveForTest(sensitive bool, prim *pb.Primitive) *pb.Primitive {

0 commit comments

Comments
 (0)