Skip to content

Commit 1601eb9

Browse files
Enable bypass of ip limitations via ENV in collector processing (#3652)
* Enable bypass of ip limitations via ENV in collector startup resolves #3625 connect #3626 * dev build * bump dockerx build action * enable runtime setting config of collector requests * comments and linting for option passing * unset * unset * update docs link * linting and docs
1 parent fd4929b commit 1601eb9

File tree

8 files changed

+173
-6
lines changed

8 files changed

+173
-6
lines changed

.github/workflows/dev-build.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ concurrency:
66

77
on:
88
push:
9-
branches: ['na'] # put your current branch to create a build. Core team only.
9+
branches: ['3625-bypass-ip-check'] # put your current branch to create a build. Core team only.
1010
paths-ignore:
1111
- '**.md'
1212
- 'cloud-deployments/*'

collector/middleware/verifyIntegrity.js

+7-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
const { CommunicationKey } = require("../utils/comKey");
2+
const RuntimeSettings = require("../utils/runtimeSettings");
3+
const runtimeSettings = new RuntimeSettings();
24

35
function verifyPayloadIntegrity(request, response, next) {
46
const comKey = new CommunicationKey();
57
if (process.env.NODE_ENV === "development") {
6-
comKey.log('verifyPayloadIntegrity is skipped in development.')
8+
comKey.log('verifyPayloadIntegrity is skipped in development.');
9+
runtimeSettings.parseOptionsFromRequest(request);
710
next();
811
return;
912
}
@@ -12,7 +15,9 @@ function verifyPayloadIntegrity(request, response, next) {
1215
if (!signature) return response.status(400).json({ msg: 'Failed integrity signature check.' })
1316

1417
const validSignedPayload = comKey.verify(signature, request.body);
15-
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' })
18+
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' });
19+
20+
runtimeSettings.parseOptionsFromRequest(request);
1621
next();
1722
}
1823

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
const { reqBody } = require("../http");
2+
3+
/**
4+
* Runtime settings are used to configure the collector per-request.
5+
* These settings are persisted across requests, but can be overridden per-request.
6+
*
7+
* The settings are passed in the request body via `options.runtimeSettings`
8+
* which is set in the backend #attachOptions function in CollectorApi.
9+
*
10+
* We do this so that the collector and backend can share the same ENV variables
11+
* but only pass the relevant settings to the collector per-request and be able to
12+
* access them across the collector via a single instance of RuntimeSettings.
13+
*
14+
* TODO: We may want to set all options passed from backend to collector here,
15+
* but for now - we are only setting the runtime settings specifically for backwards
16+
* compatibility with existing CollectorApi usage.
17+
*/
18+
class RuntimeSettings {
19+
static _instance = null;
20+
settings = {};
21+
22+
// Any settings here will be persisted across requests
23+
// and must be explicitly defined here.
24+
settingConfigs = {
25+
allowAnyIp: {
26+
default: false,
27+
// Value must be explicitly "true" or "false" as a string
28+
validate: (value) => String(value) === "true",
29+
},
30+
};
31+
32+
constructor() {
33+
if (RuntimeSettings._instance) return RuntimeSettings._instance;
34+
RuntimeSettings._instance = this;
35+
return this;
36+
}
37+
38+
/**
39+
   * Parse the runtime settings from the `options.runtimeSettings` object in the request body
40+
* see #attachOptions https://github.com/Mintplex-Labs/anything-llm/blob/ebf112007e0d579af3d2b43569db95bdfc59074b/server/utils/collectorApi/index.js#L18
41+
* @param {import('express').Request} request
42+
* @returns {void}
43+
*/
44+
parseOptionsFromRequest(request = {}) {
45+
const options = reqBody(request)?.options?.runtimeSettings || {};
46+
for (const [key, value] of Object.entries(options)) {
47+
if (!this.settingConfigs.hasOwnProperty(key)) continue;
48+
this.set(key, value);
49+
}
50+
return;
51+
}
52+
53+
/**
54+
* Get a runtime setting
55+
* - Will throw an error if the setting requested is not a supported runtime setting key
56+
* - Will return the default value if the setting requested is not set at all
57+
* @param {string} key
58+
* @returns {any}
59+
*/
60+
get(key) {
61+
if (!this.settingConfigs[key])
62+
throw new Error(`Invalid runtime setting: ${key}`);
63+
return this.settings.hasOwnProperty(key)
64+
? this.settings[key]
65+
: this.settingConfigs[key].default;
66+
}
67+
68+
/**
69+
* Set a runtime setting
70+
* - Will throw an error if the setting requested is not a supported runtime setting key
71+
* - Will validate the value against the setting's validate function
72+
* @param {string} key
73+
* @param {any} value
74+
* @returns {void}
75+
*/
76+
set(key, value = null) {
77+
if (!this.settingConfigs[key])
78+
throw new Error(`Invalid runtime setting: ${key}`);
79+
this.settings[key] = this.settingConfigs[key].validate(value);
80+
}
81+
}
82+
83+
module.exports = RuntimeSettings;

collector/utils/url/index.js

+18
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
const RuntimeSettings = require("../runtimeSettings");
12
/** ATTN: SECURITY RESEARCHERS
23
* To Security researchers about to submit an SSRF report CVE - please don't.
34
 * We are aware that the code below does not defend against any of the thousands of ways
@@ -13,15 +14,24 @@
1314

1415
const VALID_PROTOCOLS = ["https:", "http:"];
1516
const INVALID_OCTETS = [192, 172, 10, 127];
17+
const runtimeSettings = new RuntimeSettings();
1618

1719
/**
1820
 * If an IP address is passed in, the user is attempting to collect from some internal service running on an internal/private IP.
1921
 * This is not a security feature; it simply prevents the user from accidentally entering invalid IP addresses.
22+
* Can be bypassed via COLLECTOR_ALLOW_ANY_IP environment variable.
2023
* @param {URL} param0
2124
* @param {URL['hostname']} param0.hostname
2225
* @returns {boolean}
2326
*/
2427
function isInvalidIp({ hostname }) {
28+
if (runtimeSettings.get("allowAnyIp")) {
29+
console.log(
30+
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
31+
);
32+
return false;
33+
}
34+
2535
const IPRegex = new RegExp(
2636
/^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$/gi
2737
);
@@ -40,6 +50,14 @@ function isInvalidIp({ hostname }) {
4050
return INVALID_OCTETS.includes(Number(octetOne));
4151
}
4252

53+
/**
54+
* Validates a URL
55+
* - Checks the URL forms a valid URL
56+
* - Checks the URL is at least HTTP(S)
57+
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
58+
* @param {string} url
59+
* @returns {boolean}
60+
*/
4361
function validURL(url) {
4462
try {
4563
const destination = new URL(url);

docker/.env.example

+4
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,10 @@ GID='1000'
322322
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
323323
# SIMPLE_SSO_ENABLED=1
324324

325+
# Allow scraping of any IP address in collector - must be string "true" to be enabled
326+
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
327+
# COLLECTOR_ALLOW_ANY_IP="true"
328+
325329
# Specify the target languages for when using OCR to parse images and PDFs.
326330
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
327331
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.

server/.env.example

+4
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,10 @@ TTS_PROVIDER="native"
311311
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
312312
# SIMPLE_SSO_ENABLED=1
313313

314+
# Allow scraping of any IP address in collector - must be string "true" to be enabled
315+
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
316+
# COLLECTOR_ALLOW_ANY_IP="true"
317+
314318
# Specify the target languages for when using OCR to parse images and PDFs.
315319
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
316320
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.

server/utils/collectorApi/index.js

+53-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
const { EncryptionManager } = require("../EncryptionManager");
22

3+
/**
4+
* @typedef {Object} CollectorOptions
5+
* @property {string} whisperProvider - The provider to use for whisper, defaults to "local"
6+
* @property {string} WhisperModelPref - The model to use for whisper if set.
7+
* @property {string} openAiKey - The API key to use for OpenAI interfacing, mostly passed to OAI Whisper provider.
8+
* @property {Object} ocr - The OCR options
9+
* @property {{allowAnyIp: "true"|null|undefined}} runtimeSettings - The runtime settings that are passed to the collector. Persisted across requests.
10+
*/
11+
312
// When running locally will occupy the 0.0.0.0 hostname space but when deployed inside
413
// of docker this endpoint is not exposed so it is only on the Docker instances internal network
514
// so no additional security is needed on the endpoint directly. Auth is done however by the express
@@ -15,6 +24,10 @@ class CollectorApi {
1524
console.log(`\x1b[36m[CollectorApi]\x1b[0m ${text}`, ...args);
1625
}
1726

27+
/**
28+
* Attach options to the request passed to the collector API
29+
* @returns {CollectorOptions}
30+
*/
1831
#attachOptions() {
1932
return {
2033
whisperProvider: process.env.WHISPER_PROVIDER || "local",
@@ -23,6 +36,9 @@ class CollectorApi {
2336
ocr: {
2437
langList: process.env.TARGET_OCR_LANG || "eng",
2538
},
39+
runtimeSettings: {
40+
allowAnyIp: process.env.COLLECTOR_ALLOW_ANY_IP ?? "false",
41+
},
2642
};
2743
}
2844

@@ -45,6 +61,12 @@ class CollectorApi {
4561
});
4662
}
4763

64+
/**
65+
* Process a document
66+
* - Will append the options to the request body
67+
* @param {string} filename - The filename of the document to process
68+
* @returns {Promise<Object>} - The response from the collector API
69+
*/
4870
async processDocument(filename = "") {
4971
if (!filename) return false;
5072

@@ -75,10 +97,16 @@ class CollectorApi {
7597
});
7698
}
7799

100+
/**
101+
* Process a link
102+
* - Will append the options to the request body
103+
* @param {string} link - The link to process
104+
* @returns {Promise<Object>} - The response from the collector API
105+
*/
78106
async processLink(link = "") {
79107
if (!link) return false;
80108

81-
const data = JSON.stringify({ link });
109+
const data = JSON.stringify({ link, options: this.#attachOptions() });
82110
return await fetch(`${this.endpoint}/process-link`, {
83111
method: "POST",
84112
headers: {
@@ -101,8 +129,19 @@ class CollectorApi {
101129
});
102130
}
103131

132+
/**
133+
* Process raw text as a document for the collector
134+
* - Will append the options to the request body
135+
* @param {string} textContent - The text to process
136+
* @param {Object} metadata - The metadata to process
137+
* @returns {Promise<Object>} - The response from the collector API
138+
*/
104139
async processRawText(textContent = "", metadata = {}) {
105-
const data = JSON.stringify({ textContent, metadata });
140+
const data = JSON.stringify({
141+
textContent,
142+
metadata,
143+
options: this.#attachOptions(),
144+
});
106145
return await fetch(`${this.endpoint}/process-raw-text`, {
107146
method: "POST",
108147
headers: {
@@ -151,10 +190,21 @@ class CollectorApi {
151190
});
152191
}
153192

193+
/**
194+
* Get the content of a link only in a specific format
195+
* - Will append the options to the request body
196+
* @param {string} link - The link to get the content of
197+
* @param {"text"|"html"} captureAs - The format to capture the content as
198+
* @returns {Promise<Object>} - The response from the collector API
199+
*/
154200
async getLinkContent(link = "", captureAs = "text") {
155201
if (!link) return false;
156202

157-
const data = JSON.stringify({ link, captureAs });
203+
const data = JSON.stringify({
204+
link,
205+
captureAs,
206+
options: this.#attachOptions(),
207+
});
158208
return await fetch(`${this.endpoint}/util/get-link`, {
159209
method: "POST",
160210
headers: {

server/utils/helpers/updateENV.js

+3
Original file line numberDiff line numberDiff line change
@@ -958,6 +958,9 @@ function dumpENV() {
958958

959959
// OCR Language Support
960960
"TARGET_OCR_LANG",
961+
962+
// Collector API common ENV - allows bypassing URL validation checks
963+
"COLLECTOR_ALLOW_ANY_IP",
961964
];
962965

963966
// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.

0 commit comments

Comments
 (0)