2021-11-19 18:22:27 +01:00
import { promises as fs , createWriteStream } from 'fs' ;
import path from 'path' ;
2021-11-19 18:13:45 +01:00
import { inject , injectable } from 'inversify' ;
import sequelize from 'sequelize' ;
import { FileCache } from '../models/index.js' ;
import { TYPES } from '../types.js' ;
import Config from './config.js' ;
2021-12-03 05:01:35 +01:00
import PQueue from 'p-queue' ;
2021-12-03 07:26:36 +01:00
import debug from '../utils/debug.js' ;
2021-11-19 18:13:45 +01:00
/**
 * Disk-backed file cache whose index lives in the `FileCache` database table.
 *
 * Files are stored in `config.CACHE_DIR` under their hash. Once the total
 * number of bytes recorded in the database exceeds
 * `config.CACHE_LIMIT_IN_BYTES`, the least recently accessed entries are
 * evicted.
 */
@injectable()
export default class FileCacheProvider {
  // Shared by all instances (static) and limited to one task at a time so
  // that eviction passes never run concurrently.
  private static readonly evictionQueue = new PQueue({concurrency: 1});

  private readonly config: Config;

  constructor(@inject(TYPES.Config) config: Config) {
    this.config = config;
  }

  /**
   * Returns path to cached file if it exists, otherwise throws an error.
   * Updates the `accessedAt` property of the cached file.
   * @param hash lookup key
   * @returns path of the cached file inside the cache directory
   * @throws Error with message 'File is not cached' when there is no database
   * entry for the hash, or when the entry exists but the file cannot be
   * accessed (the stale entry is deleted in that case)
   */
  async getPathFor(hash: string): Promise<string> {
    const model = await FileCache.findByPk(hash);

    if (!model) {
      throw new Error('File is not cached');
    }

    const resolvedPath = path.join(this.config.CACHE_DIR, hash);

    try {
      await fs.access(resolvedPath);
    } catch (_: unknown) {
      // The database entry is stale: drop it so later lookups fail fast.
      await FileCache.destroy({where: {hash}});

      throw new Error('File is not cached');
    }

    await model.update({accessedAt: new Date()});

    return resolvedPath;
  }

  /**
   * Returns a write stream for the given hash key.
   * The stream writes to a temporary file; after the stream is closed the
   * file is moved into the cache directory, the database is updated and an
   * eviction pass is run. Failures in that post-processing are logged and
   * never thrown.
   * @param hash lookup key
   * @returns a write stream targeting the temporary file for this hash
   */
  createWriteStream(hash: string) {
    const tmpPath = path.join(this.config.CACHE_DIR, 'tmp', hash);
    const finalPath = path.join(this.config.CACHE_DIR, hash);

    const stream = createWriteStream(tmpPath);

    stream.on('close', async () => {
      // Nothing awaits an event listener, so every failure has to be handled
      // here; otherwise it surfaces as an unhandled promise rejection.
      try {
        // Only move if size is non-zero (may have errored out)
        const stats = await fs.stat(tmpPath);

        if (stats.size !== 0) {
          await fs.rename(tmpPath, finalPath);

          await FileCache.create({hash, bytes: stats.size, accessedAt: new Date()});
        }
      } catch (error: unknown) {
        debug('Errored when moving a finished cache file:', error);
      }

      try {
        await this.evictOldestIfNecessary();
      } catch (error: unknown) {
        debug('Errored when evicting old cache files:', error);
      }
    });

    return stream;
  }

  /**
   * Deletes orphaned cache files and evicts files if
   * necessary. Should be run on program startup so files
   * will be evicted if the cache limit has changed.
   */
  async cleanup(): Promise<void> {
    await this.removeOrphans();

    await this.evictOldestIfNecessary();
  }

  /**
   * Schedules an eviction pass on the shared queue and resolves once that
   * pass has finished. Rejects if the pass fails.
   */
  private async evictOldestIfNecessary(): Promise<void> {
    // Await the promise returned by `add` rather than `onEmpty()`: `onEmpty()`
    // settles as soon as no task is waiting to start, which can be before the
    // running eviction has completed, and it hides the task's rejection.
    await FileCacheProvider.evictionQueue.add(async () => this.evictOldest());
  }

  /**
   * Evicts the least recently accessed entries, one at a time, until the
   * total size recorded in the database is within the configured limit.
   */
  private async evictOldest(): Promise<void> {
    debug('Evicting oldest files...');

    let totalSizeBytes = await this.getDiskUsageInBytes();
    let numOfEvictedFiles = 0;

    // Continue to evict until we're under the limit
    /* eslint-disable no-await-in-loop */
    while (totalSizeBytes > this.config.CACHE_LIMIT_IN_BYTES) {
      const oldest = await FileCache.findOne({
        order: [
          ['accessedAt', 'ASC'],
        ],
      });

      if (!oldest) {
        // Nothing left to evict; without this guard the loop would spin
        // forever because the reported size could never change.
        break;
      }

      await oldest.destroy();
      await this.unlinkIfExists(path.join(this.config.CACHE_DIR, oldest.hash));
      debug(`${oldest.hash} has been evicted`);
      numOfEvictedFiles++;

      totalSizeBytes = await this.getDiskUsageInBytes();
    }
    /* eslint-enable no-await-in-loop */

    if (numOfEvictedFiles > 0) {
      debug(`${numOfEvictedFiles} files have been evicted`);
    } else {
      debug(`No files needed to be evicted. Total size of the cache is currently ${totalSizeBytes} bytes, and the cache limit is ${this.config.CACHE_LIMIT_IN_BYTES} bytes.`);
    }
  }

  /**
   * Deletes a file, treating "file does not exist" as success so that an
   * already-missing file does not abort an eviction pass.
   * @param filePath path of the file to delete
   * @throws any error from `fs.unlink` other than ENOENT
   */
  private async unlinkIfExists(filePath: string): Promise<void> {
    try {
      await fs.unlink(filePath);
    } catch (error: unknown) {
      const code = (error as {code?: unknown} | null | undefined)?.code;

      if (code !== 'ENOENT') {
        throw error;
      }

      debug(`${filePath} was already missing from disk.`);
    }
  }

  /**
   * Reconciles the cache directory with the database in both directions:
   * files without a database entry are deleted from disk, and database
   * entries without a file are deleted from the database.
   */
  private async removeOrphans(): Promise<void> {
    // Check filesystem direction (do files exist on the disk but not in the database?)
    for await (const dirent of await fs.opendir(this.config.CACHE_DIR)) {
      // Only regular files are considered; directories are skipped.
      if (dirent.isFile()) {
        const model = await FileCache.findByPk(dirent.name);

        if (!model) {
          debug(`${dirent.name} was present on disk but was not in the database. Removing from disk.`);
          await fs.unlink(path.join(this.config.CACHE_DIR, dirent.name));
        }
      }
    }

    // Check database direction (do entries exist in the database but not on the disk?)
    for await (const model of this.getFindAllIterable()) {
      const filePath = path.join(this.config.CACHE_DIR, model.hash);

      try {
        await fs.access(filePath);
      } catch {
        debug(`${model.hash} was present in database but was not on disk. Removing from database.`);
        await model.destroy();
      }
    }
  }

  /**
   * Pulls from the database rather than the filesystem,
   * so may be slightly inaccurate.
   * @returns the total size of the cache in bytes (0 when the cache is empty)
   */
  private async getDiskUsageInBytes(): Promise<number> {
    // The aggregate alias is not part of the model's declared attributes,
    // hence the cast to describe the shape of the returned row.
    const [row] = await FileCache.findAll({
      attributes: [
        [sequelize.fn('sum', sequelize.col('bytes')), 'totalSizeBytes'],
      ],
    }) as unknown as Array<{dataValues: {totalSizeBytes: number | string | null}}>;

    // SUM over zero rows is NULL; normalise that (and any string-typed
    // aggregate) to a plain number.
    return Number(row?.dataValues.totalSizeBytes ?? 0);
  }

  /**
   * An efficient way to iterate over all rows: fetches them in fixed-size
   * batches instead of loading the whole table at once.
   * @returns an async iterable over every FileCache row, ordered by hash
   */
  private async * getFindAllIterable(): AsyncGenerator<FileCache, void, undefined> {
    const limit = 50;
    let previousHash: string | null = null;
    let batch: FileCache[];

    /* eslint-disable no-await-in-loop */
    do {
      // Page on the unique `hash` key. Paging on a non-unique column such as
      // `createdAt` skips rows that share a value across a batch boundary.
      let where = {};

      if (previousHash !== null) {
        where = {
          hash: {
            [sequelize.Op.gt]: previousHash,
          },
        };
      }

      batch = await FileCache.findAll({
        where,
        limit,
        order: [['hash', 'ASC']],
      });

      for (const model of batch) {
        yield model;
      }

      if (batch.length > 0) {
        previousHash = batch[batch.length - 1].hash;
      }
      // A short batch means the table has been exhausted.
    } while (batch.length === limit);
    /* eslint-enable no-await-in-loop */
  }
}