Skip to content

Commit 0e7b273

Browse files
authored
ls cli (#204)
* Working on ls * ls working with tests * Fix test
1 parent 441d56c commit 0e7b273

File tree

2 files changed

+296
-10
lines changed

2 files changed

+296
-10
lines changed

python/hdfs_native/cli.py

+195-2
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,35 @@
66
import stat
77
import sys
88
from argparse import ArgumentParser, Namespace
9+
from collections import defaultdict
910
from concurrent.futures import ThreadPoolExecutor, as_completed
10-
from typing import List, Optional, Sequence, Tuple
11+
from datetime import datetime
12+
from typing import Dict, List, Optional, Sequence, Tuple, Union
1113
from urllib.parse import urlparse
1214

1315
from hdfs_native import Client
14-
from hdfs_native._internal import WriteOptions
16+
from hdfs_native._internal import FileStatus, WriteOptions
17+
18+
__all__ = ["main"]
1519

1620

1721
@functools.cache
def _get_client(connection_url: Optional[str] = None):
    """Build a ``Client`` for ``connection_url``, memoised per distinct argument.

    ``functools.cache`` keys on the argument, so repeated calls with the same
    URL (or with ``None``) hand back the same ``Client`` object instead of
    constructing a new one.
    """
    cached_client = Client(connection_url)
    return cached_client
2024

2125

26+
def _prefix_for_url(url: str) -> str:
27+
parsed = urlparse(url)
28+
29+
if parsed.scheme:
30+
prefix = f"{parsed.scheme}://{parsed.hostname}"
31+
if parsed.port:
32+
prefix += f":{parsed.port}"
33+
return prefix
34+
35+
return ""
36+
37+
2238
def _client_for_url(url: str) -> Client:
2339
parsed = urlparse(url)
2440

@@ -217,6 +233,125 @@ def get(args: Namespace):
217233
f.result()
218234

219235

236+
def ls(args: Namespace):
    """List the files and directories matched by ``args.path``.

    Each URL in ``args.path`` is glob-expanded. For a match that is a
    directory, its entries are listed (recursively when ``args.recursive`` is
    set) beneath a ``Found N items`` header, with columns aligned across the
    listing. Any other match is printed as a single row. With
    ``args.path_only`` only the paths are printed and the header is omitted.

    Options read from ``args``: ``path``, ``path_only``, ``human_readable``,
    ``recursive``, ``sort_time``, ``sort_size``, ``reverse`` and
    ``access_time``.
    """

    def human_size(num: int) -> str:
        """Render a byte count using 1024-based unit suffixes."""
        if num < 1024:
            return str(num)

        adjusted = num / 1024.0
        for unit in ("K", "M", "G", "T", "P", "E", "Z"):
            if abs(adjusted) < 1024.0:
                return f"{adjusted:.1f} {unit}"
            adjusted /= 1024.0
        return f"{adjusted:.1f} Y"

    def parse_status(status: FileStatus, prefix: str) -> Dict[str, Union[int, str]]:
        """Convert one ``FileStatus`` into the raw and display fields of a row."""
        # ``-u`` switches both the displayed and the sorted-on timestamp.
        file_time = status.access_time if args.access_time else status.modification_time

        # The timestamp is divided by 1000 before conversion, i.e. it is
        # treated as milliseconds since the epoch.
        file_time_string = datetime.fromtimestamp(file_time / 1000).strftime(
            r"%Y-%m-%d %H:%M"
        )

        # Add the file-type bits so ``stat.filemode`` renders the leading
        # "d" / "-" character.
        permission = status.permission
        if status.isdir:
            permission |= stat.S_IFDIR
        else:
            permission |= stat.S_IFREG

        mode = stat.filemode(permission)

        if args.human_readable:
            length_string = human_size(status.length)
        else:
            length_string = str(status.length)

        path = prefix + status.path

        return {
            "mode": mode,
            "replication": str(status.replication) if status.replication else "-",
            "owner": status.owner,
            "group": status.group,
            # Raw values are kept next to the formatted ones for sorting.
            "length": status.length,
            "length_formatted": length_string,
            "time": file_time,
            "time_formatted": file_time_string,
            "path": path,
        }

    def get_widths(parsed: List[Dict[str, Union[int, str]]]) -> Dict[str, int]:
        """Return the widest string value seen for each field across ``parsed``."""
        widths: Dict[str, int] = defaultdict(lambda: 0)

        for file in parsed:
            for key, value in file.items():
                # Only string fields are displayed; raw ints are sort keys.
                if isinstance(value, str):
                    widths[key] = max(widths[key], len(value))

        return widths

    def print_files(
        parsed: List[Dict[str, Union[int, str]]],
        widths: Optional[Dict[str, int]] = None,
    ):
        """Print ``parsed`` rows, sorted per the CLI flags and padded to ``widths``."""
        # Time and size sorts are descending by default; ``-r`` flips them.
        if args.sort_time:
            parsed = sorted(parsed, key=lambda x: x["time"], reverse=not args.reverse)
        elif args.sort_size:
            parsed = sorted(parsed, key=lambda x: x["length"], reverse=not args.reverse)

        def format_field(
            file: Dict[str, Union[int, str]],
            field: str,
            right_align: bool = False,
        ) -> str:
            value = str(file[field])

            width = len(value)
            if widths and field in widths:
                width = widths[field]

            if right_align:
                return f"{value:>{width}}"
            return f"{value:{width}}"

        for file in parsed:
            if args.path_only:
                print(file["path"])
            else:
                formatted_fields = [
                    format_field(file, "mode"),
                    format_field(file, "replication"),
                    format_field(file, "owner"),
                    format_field(file, "group"),
                    format_field(file, "length_formatted", True),
                    format_field(file, "time_formatted"),
                    # The path is the last column, so it is left unpadded to
                    # avoid emitting trailing whitespace.
                    str(file["path"]),
                ]
                print(" ".join(formatted_fields))

    for url in args.path:
        client = _client_for_url(url)
        # The prefix depends only on the URL, so compute it once per URL.
        prefix = _prefix_for_url(url)

        for path in _glob_path(client, _path_for_url(url)):
            status = client.get_file_info(path)

            if status.isdir:
                parsed = [
                    parse_status(child, prefix)
                    for child in client.list_status(path, args.recursive)
                ]

                if not args.path_only:
                    print(f"Found {len(parsed)} items")

                widths = get_widths(parsed)
                print_files(parsed, widths)
            else:
                print_files([parse_status(status, prefix)])
353+
354+
220355
def mkdir(args: Namespace):
221356
create_parent = args.parent
222357

@@ -427,6 +562,64 @@ def main(in_args: Optional[Sequence[str]] = None):
427562
)
428563
get_parser.set_defaults(func=get)
429564

565+
ls_parser = subparsers.add_parser(
566+
"ls",
567+
help="List contents that match the specified patterns",
568+
description="""List contents that match the specified patterns. For a directory, list its
569+
direct children.""",
570+
)
571+
ls_parser.add_argument(
572+
"-C",
573+
"--path-only",
574+
action="store_true",
575+
default=False,
576+
help="Display the path of files and directories only.",
577+
)
578+
ls_parser.add_argument(
579+
"-H",
580+
"--human-readable",
581+
action="store_true",
582+
default=False,
583+
help="Formats the sizes of files in a human-readable fashion rather than a number of bytes",
584+
)
585+
ls_parser.add_argument(
586+
"-R",
587+
"--recursive",
588+
action="store_true",
589+
default=False,
590+
help="Recursively list the contents of directories",
591+
)
592+
ls_parser.add_argument(
593+
"-t",
594+
"--sort-time",
595+
action="store_true",
596+
default=False,
597+
help="Sort files by modification time (most recent first)",
598+
)
599+
ls_parser.add_argument(
600+
"-S",
601+
"--sort-size",
602+
action="store_true",
603+
default=False,
604+
help="Sort files by size (largest first)",
605+
)
606+
ls_parser.add_argument(
607+
"-r",
608+
"--reverse",
609+
action="store_true",
610+
default=False,
611+
help="Reverse the order of the sort",
612+
)
613+
ls_parser.add_argument(
614+
"-u",
615+
"--access-time",
616+
action="store_true",
617+
default=False,
618+
help="Use the last access time instead of modification time for display and sorting",
619+
)
620+
ls_parser.add_argument("path", nargs="+", help="Path to display contents of")
621+
ls_parser.set_defaults(func=ls)
622+
430623
mkdir_parser = subparsers.add_parser(
431624
"mkdir",
432625
help="Create a directory",

python/tests/test_cli.py

+101-8
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import contextlib
2+
import dataclasses
23
import io
34
import os
5+
import re
46
import stat
57
from tempfile import TemporaryDirectory
8+
from typing import Callable, Iterator, List, Literal, Optional, Tuple, overload
69

710
import pytest
811

@@ -18,22 +21,37 @@ def assert_not_exists(client: Client, path: str):
1821
pass
1922

2023

24+
@overload
def capture_stdout(func: Callable[[], None], text: Literal[False]) -> bytes: ...


@overload
def capture_stdout(func: Callable[[], None], text: Literal[True] = True) -> str: ...


def capture_stdout(func: Callable[[], None], text: bool = True):
    """Run ``func`` with ``sys.stdout`` redirected and return what it wrote.

    Args:
        func: Zero-argument callable to execute while stdout is captured.
        text: When true (the default) return the captured output decoded as
            ``str``; when false return the raw ``bytes``.

    Returns:
        Everything ``func`` wrote to stdout, whether through the text layer
        (e.g. ``print``) or directly through ``sys.stdout.buffer``.
    """
    buf = io.BytesIO()
    with contextlib.redirect_stdout(io.TextIOWrapper(buf)) as wrapper:
        func()
        # Text written through the wrapper sits in its internal buffer until
        # flushed; push it into ``buf`` so the bytes branch sees it as well.
        wrapper.flush()
        if text:
            wrapper.seek(0)
            return wrapper.read()
        else:
            return buf.getvalue()
41+
42+
2143
def test_cat(client: Client):
2244
with client.create("/testfile") as file:
2345
file.write(b"1234")
2446

25-
buf = io.BytesIO()
26-
with contextlib.redirect_stdout(io.TextIOWrapper(buf)):
27-
cli_main(["cat", "/testfile"])
28-
assert buf.getvalue() == b"1234"
47+
output = capture_stdout(lambda: cli_main(["cat", "/testfile"]), False)
48+
assert output == b"1234"
2949

3050
with client.create("/testfile2") as file:
3151
file.write(b"5678")
3252

33-
buf = io.BytesIO()
34-
with contextlib.redirect_stdout(io.TextIOWrapper(buf)):
35-
cli_main(["cat", "/testfile", "/testfile2"])
36-
assert buf.getvalue() == b"12345678"
53+
output = capture_stdout(lambda: cli_main(["cat", "/testfile", "/testfile2"]), False)
54+
assert output == b"12345678"
3755

3856
with pytest.raises(FileNotFoundError):
3957
cli_main(["cat", "/nonexistent"])
@@ -159,6 +177,81 @@ def test_get(client: Client):
159177
assert file.read() == data
160178

161179

180+
def test_ls(client: Client):
    """Exercise the ``ls`` subcommand: default listing, sort flags, ``-H`` and ``-C``."""

    @dataclasses.dataclass
    class FileOutput:
        # The columns of one listing row that the assertions compare; owner,
        # group and timestamp are matched but not captured.
        permission: str
        replication: str
        size: str
        path: str

    # ``(\d+)`` captures the whole count; ``(\d)+`` would keep only its last
    # digit (e.g. "2" for "Found 12 items").
    header_pattern = re.compile(r"Found (\d+) items")
    row_pattern = re.compile(
        r"(\S+)\s+(\S+)\s+\S+\s+\S+\s+([0-9.]+(?: \w)?)\s+\S+\s+\S+\s+(\S+)"
    )

    def parse_output(output: str) -> Iterator[Tuple[int, List[FileOutput]]]:
        """Yield ``(announced_count, rows)`` for each "Found N items" group."""
        current_items: Optional[int] = None
        current_batch: List[FileOutput] = []

        for line in output.split("\n"):
            if match := header_pattern.match(line):
                # A new header closes the previous group, if there was one.
                if current_items is not None:
                    yield (current_items, current_batch)

                current_items = int(match.group(1))
                current_batch = []

            elif line.strip():
                match = row_pattern.match(line)
                assert match is not None
                current_batch.append(
                    FileOutput(
                        permission=match.group(1),
                        replication=match.group(2),
                        size=match.group(3),
                        path=match.group(4),
                    )
                )

        # Emit the final group even when it has no rows, matching how groups
        # in the middle of the output are handled.
        if current_items is not None:
            yield (current_items, current_batch)

    with pytest.raises(FileNotFoundError):
        cli_main(["ls", "/fake"])

    with client.create("/testfile1") as f:
        f.write(bytes(range(10)))

    with client.create("/testfile2") as f:
        for i in range(1024):
            f.write(i.to_bytes(4, "big"))

    client.mkdirs("/testdir")

    directory = FileOutput("drwxr-xr-x", "-", "0", "/testdir")
    file1 = FileOutput("-rw-r--r--", "3", "10", "/testfile1")
    file2 = FileOutput("-rw-r--r--", "3", "4096", "/testfile2")

    def check_output(command: List[str], expected: List[FileOutput]):
        groups = list(parse_output(capture_stdout(lambda: cli_main(command))))
        assert len(groups) == 1
        assert groups[0][0] == 3
        assert len(groups[0][1]) == 3
        assert groups[0][1] == expected

    check_output(["ls", "/"], [directory, file1, file2])
    check_output(["ls", "-t", "/"], [directory, file2, file1])
    check_output(["ls", "-r", "-t", "/"], [file1, file2, directory])
    check_output(["ls", "-S", "/"], [file2, file1, directory])
    check_output(["ls", "-r", "-S", "/"], [directory, file1, file2])

    check_output(
        ["ls", "-H", "/"], [directory, file1, dataclasses.replace(file2, size="4.0 K")]
    )

    output = capture_stdout(lambda: cli_main(["ls", "-C", "/"])).strip().split("\n")
    assert output == [directory.path, file1.path, file2.path]
253+
254+
162255
def test_mkdir(client: Client):
163256
cli_main(["mkdir", "/testdir"])
164257
assert client.get_file_info("/testdir").isdir

0 commit comments

Comments
 (0)