Skip to content

Enable bypass of ip limitations via ENV in collector processing #3652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 21, 2025
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['na'] # put your current branch to create a build. Core team only.
branches: ['3625-bypass-ip-check'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
9 changes: 7 additions & 2 deletions collector/middleware/verifyIntegrity.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
const { CommunicationKey } = require("../utils/comKey");
const RuntimeSettings = require("../utils/runtimeSettings");
const runtimeSettings = new RuntimeSettings();

function verifyPayloadIntegrity(request, response, next) {
const comKey = new CommunicationKey();
if (process.env.NODE_ENV === "development") {
comKey.log('verifyPayloadIntegrity is skipped in development.')
comKey.log('verifyPayloadIntegrity is skipped in development.');
runtimeSettings.parseOptionsFromRequest(request);
next();
return;
}
Expand All @@ -12,7 +15,9 @@ function verifyPayloadIntegrity(request, response, next) {
if (!signature) return response.status(400).json({ msg: 'Failed integrity signature check.' })

const validSignedPayload = comKey.verify(signature, request.body);
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' })
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' });

runtimeSettings.parseOptionsFromRequest(request);
next();
}

Expand Down
83 changes: 83 additions & 0 deletions collector/utils/runtimeSettings/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
const { reqBody } = require("../http");

/**
 * Runtime settings are used to configure the collector per-request.
 * These settings are persisted across requests, but can be overridden per-request.
 *
 * The settings are passed in the request body via `options.runtimeSettings`
 * which is set in the backend #attachOptions function in CollectorApi.
 *
 * We do this so that the collector and backend can share the same ENV variables
 * but only pass the relevant settings to the collector per-request and be able to
 * access them across the collector via a single instance of RuntimeSettings.
 *
 * The class is a singleton: every `new RuntimeSettings()` returns the same instance,
 * so values stored by one caller are visible to every other caller.
 *
 * TODO: We may want to set all options passed from backend to collector here,
 * but for now - we are only setting the runtime settings specifically for backwards
 * compatibility with existing CollectorApi usage.
 */
class RuntimeSettings {
  static _instance = null;

  // Values that have been explicitly set; keys not present here fall back to their default.
  settings = {};

  // Any settings here will be persisted across requests
  // and must be explicitly defined here.
  settingConfigs = {
    allowAnyIp: {
      default: false,
      // Only the value "true" (after string coercion) enables the setting;
      // anything else is stored as false.
      validate: (value) => String(value) === "true",
    },
  };

  constructor() {
    // Returning the existing instance from the constructor is what makes this a singleton.
    if (RuntimeSettings._instance) return RuntimeSettings._instance;
    RuntimeSettings._instance = this;
    return this;
  }

  /**
   * Check whether a key is an explicitly defined runtime setting.
   * Uses an own-property check so inherited Object.prototype names
   * (e.g. "constructor", "toString") are never treated as settings.
   * @param {string} key
   * @returns {boolean}
   */
  hasSetting(key) {
    return Object.prototype.hasOwnProperty.call(this.settingConfigs, key);
  }

  /**
   * Parse the runtime settings from the request body options
   * see #attachOptions https://github.com/Mintplex-Labs/anything-llm/blob/ebf112007e0d579af3d2b43569db95bdfc59074b/server/utils/collectorApi/index.js#L18
   * - Keys that are not supported runtime settings are ignored
   * - Settings missing from the request keep their previously stored value
   * @param {import('express').Request} request
   * @returns {void}
   */
  parseOptionsFromRequest(request = {}) {
    // NOTE(review): reqBody is assumed to return the parsed request body - confirm in ../http
    const options = reqBody(request)?.options?.runtimeSettings || {};
    for (const [key, value] of Object.entries(options)) {
      if (!this.hasSetting(key)) continue;
      this.set(key, value);
    }
  }

  /**
   * Get a runtime setting
   * - Will throw an error if the setting requested is not a supported runtime setting key
   * - Will return the default value if the setting requested is not set at all
   * @param {string} key
   * @returns {any}
   * @throws {Error} If `key` is not a supported runtime setting
   */
  get(key) {
    if (!this.hasSetting(key))
      throw new Error(`Invalid runtime setting: ${key}`);
    return Object.prototype.hasOwnProperty.call(this.settings, key)
      ? this.settings[key]
      : this.settingConfigs[key].default;
  }

  /**
   * Set a runtime setting
   * - Will throw an error if the setting requested is not a supported runtime setting key
   * - Will store the result of the setting's validate function, not the raw value
   * @param {string} key
   * @param {any} value
   * @returns {void}
   * @throws {Error} If `key` is not a supported runtime setting
   */
  set(key, value = null) {
    if (!this.hasSetting(key))
      throw new Error(`Invalid runtime setting: ${key}`);
    this.settings[key] = this.settingConfigs[key].validate(value);
  }
}

module.exports = RuntimeSettings;
18 changes: 18 additions & 0 deletions collector/utils/url/index.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
const RuntimeSettings = require("../runtimeSettings");
/** ATTN: SECURITY RESEARCHERS
* To Security researchers about to submit an SSRF report CVE - please don't.
* We are aware that the code below does not defend against any of the thousands of ways
Expand All @@ -13,15 +14,24 @@

const VALID_PROTOCOLS = ["https:", "http:"];
const INVALID_OCTETS = [192, 172, 10, 127];
const runtimeSettings = new RuntimeSettings();

/**
* If an IP address is passed in, the user is attempting to collect from some internal service running on an internal/private IP.
* This is not a security feature and simply just prevents the user from accidentally entering invalid IP addresses.
* Can be bypassed via COLLECTOR_ALLOW_ANY_IP environment variable.
* @param {URL} param0
* @param {URL['hostname']} param0.hostname
* @returns {boolean}
*/
function isInvalidIp({ hostname }) {
if (runtimeSettings.get("allowAnyIp")) {
console.log(
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
);
return false;
}

const IPRegex = new RegExp(
/^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$/gi
);
Expand All @@ -40,6 +50,14 @@ function isInvalidIp({ hostname }) {
return INVALID_OCTETS.includes(Number(octetOne));
}

/**
* Validates a URL
* - Checks the URL forms a valid URL
* - Checks the URL is at least HTTP(S)
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
* @param {string} url
* @returns {boolean}
*/
function validURL(url) {
try {
const destination = new URL(url);
Expand Down
4 changes: 4 additions & 0 deletions docker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,10 @@ GID='1000'
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1

# Allow scraping of any IP address in collector - must be string "true" to be enabled
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
# COLLECTOR_ALLOW_ANY_IP="true"

# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
Expand Down
4 changes: 4 additions & 0 deletions server/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ TTS_PROVIDER="native"
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1

# Allow scraping of any IP address in collector - must be string "true" to be enabled
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
# COLLECTOR_ALLOW_ANY_IP="true"

# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
Expand Down
56 changes: 53 additions & 3 deletions server/utils/collectorApi/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
const { EncryptionManager } = require("../EncryptionManager");

/**
* @typedef {Object} CollectorOptions
* @property {string} whisperProvider - The provider to use for whisper, defaults to "local"
* @property {string} WhisperModelPref - The model to use for whisper if set.
* @property {string} openAiKey - The API key to use for OpenAI interfacing, mostly passed to OAI Whisper provider.
* @property {Object} ocr - The OCR options
* @property {{allowAnyIp: string}} runtimeSettings - The runtime settings that are passed to the collector. Persisted across requests. `allowAnyIp` is the raw COLLECTOR_ALLOW_ANY_IP value, or "false" when it is unset.
*/

// When running locally will occupy the 0.0.0.0 hostname space but when deployed inside
// of docker this endpoint is not exposed so it is only on the Docker instance's internal network
// so no additional security is needed on the endpoint directly. Auth is done however by the express
Expand All @@ -15,6 +24,10 @@ class CollectorApi {
console.log(`\x1b[36m[CollectorApi]\x1b[0m ${text}`, ...args);
}

/**
* Attach options to the request passed to the collector API
* @returns {CollectorOptions}
*/
#attachOptions() {
return {
whisperProvider: process.env.WHISPER_PROVIDER || "local",
Expand All @@ -23,6 +36,9 @@ class CollectorApi {
ocr: {
langList: process.env.TARGET_OCR_LANG || "eng",
},
runtimeSettings: {
allowAnyIp: process.env.COLLECTOR_ALLOW_ANY_IP ?? "false",
},
};
}

Expand All @@ -45,6 +61,12 @@ class CollectorApi {
});
}

/**
* Process a document
* - Will append the options to the request body
* @param {string} filename - The filename of the document to process
* @returns {Promise<Object>} - The response from the collector API
*/
async processDocument(filename = "") {
if (!filename) return false;

Expand Down Expand Up @@ -75,10 +97,16 @@ class CollectorApi {
});
}

/**
* Process a link
* - Will append the options to the request body
* @param {string} link - The link to process
* @returns {Promise<Object>} - The response from the collector API
*/
async processLink(link = "") {
if (!link) return false;

const data = JSON.stringify({ link });
const data = JSON.stringify({ link, options: this.#attachOptions() });
return await fetch(`${this.endpoint}/process-link`, {
method: "POST",
headers: {
Expand All @@ -101,8 +129,19 @@ class CollectorApi {
});
}

/**
* Process raw text as a document for the collector
* - Will append the options to the request body
* @param {string} textContent - The text to process
* @param {Object} metadata - The metadata to process
* @returns {Promise<Object>} - The response from the collector API
*/
async processRawText(textContent = "", metadata = {}) {
const data = JSON.stringify({ textContent, metadata });
const data = JSON.stringify({
textContent,
metadata,
options: this.#attachOptions(),
});
return await fetch(`${this.endpoint}/process-raw-text`, {
method: "POST",
headers: {
Expand Down Expand Up @@ -151,10 +190,21 @@ class CollectorApi {
});
}

/**
* Get the content of a link only in a specific format
* - Will append the options to the request body
* @param {string} link - The link to get the content of
* @param {"text"|"html"} captureAs - The format to capture the content as
* @returns {Promise<Object>} - The response from the collector API
*/
async getLinkContent(link = "", captureAs = "text") {
if (!link) return false;

const data = JSON.stringify({ link, captureAs });
const data = JSON.stringify({
link,
captureAs,
options: this.#attachOptions(),
});
return await fetch(`${this.endpoint}/util/get-link`, {
method: "POST",
headers: {
Expand Down
3 changes: 3 additions & 0 deletions server/utils/helpers/updateENV.js
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,9 @@ function dumpENV() {

// OCR Language Support
"TARGET_OCR_LANG",

// Collector API common ENV - allows bypassing URL validation checks
"COLLECTOR_ALLOW_ANY_IP",
];

// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.
Expand Down