import json
import heapq
def sort_json_records(json_file_path, key_path, sort_order='asc', chunk_size=1000):
    """
    Sort line-delimited JSON records from a file by a (possibly nested) key.

    Each line of the file is parsed as its own JSON document. Lines are read
    ``chunk_size`` at a time, but every successfully parsed record is kept in
    memory until the final sort, so peak memory grows with the number of
    valid records, not with ``chunk_size``.

    Lines that are not valid JSON, that lack the requested key, or whose key
    value cannot be ordered against the values already collected are reported
    on stdout and skipped.

    Args:
        json_file_path (str): Path to the file holding one JSON object per line.
        key_path (str): Dot-separated path to the sort key (e.g., "field1.field2").
        sort_order (str): 'asc' for ascending, 'desc' for descending.
        chunk_size (int): Number of lines to read in each chunk.

    Returns:
        list: The parsed records (dicts) ordered by the key value. Records
        with equal key values keep their relative order from the file.

    Raises:
        ValueError: If ``sort_order`` is neither 'asc' nor 'desc'.
    """
    # Validate before touching the file so a bad argument is reported even
    # when the file is empty or contains no usable record.
    if sort_order not in ('asc', 'desc'):
        raise ValueError("Invalid sort_order. Use 'asc' or 'desc'.")

    keyed_records = []
    with open(json_file_path, 'r') as f:
        for chunk in read_in_chunks(f, chunk_size):
            for record in chunk:
                try:
                    record_dict = json.loads(record)
                    value = get_value_from_path(record_dict, key_path)
                    # Probe orderability now (against the first accepted key,
                    # or against itself for the first record) so unorderable
                    # values such as None, dicts, or a str among numbers are
                    # skipped here instead of aborting the final sort.
                    reference = keyed_records[0][0] if keyed_records else value
                    _ = value < reference
                except (json.JSONDecodeError, KeyError, TypeError) as e:
                    print(f"Skipping invalid record: {record}. Error: {e}")
                    continue
                keyed_records.append((value, record_dict))

    # Sort on the key alone: the record dicts are never compared (dicts are
    # unorderable), and ``reverse`` handles descending order for any
    # comparable type, not just numbers that can be negated.
    keyed_records.sort(key=lambda pair: pair[0], reverse=(sort_order == 'desc'))
    return [record_dict for _, record_dict in keyed_records]
def read_in_chunks(file_object, chunk_size):
    """
    Yield successive lists of lines read from ``file_object``.

    Each yielded list holds up to ``chunk_size`` lines obtained with
    ``readline()`` (line endings are kept). Only the last list may be
    shorter. Iteration ends once a read attempt produces no lines at all.
    """
    has_more = True
    while has_more:
        batch = []
        for _ in range(chunk_size):
            text = file_object.readline()
            if text:
                batch.append(text)
            else:
                # An empty read signals end of input; stop filling this batch.
                break
        if batch:
            yield batch
        else:
            has_more = False
def get_value_from_path(data, key_path):
    """
    Look up a nested value by following a dot-separated path of dict keys.

    Starting from ``data``, each path segment is used as a key into the
    current dict. A KeyError naming the offending segment is raised when the
    current value is not a dict or does not contain that segment.
    """
    current = data
    for segment in key_path.split('.'):
        # Guard first: anything other than a dict holding this key is a miss.
        if not isinstance(current, dict) or segment not in current:
            raise KeyError(f"Key '{segment}' not found in data.")
        current = current[segment]
    return current
# Add your comment