Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid creating expensive Path objects in split creation code #24317

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import com.facebook.drift.annotations.ThriftStruct;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.openjdk.jol.info.ClassLayout;

import java.io.IOException;
Expand Down Expand Up @@ -69,15 +68,15 @@ public static HiveFileInfo createHiveFileInfo(LocatedFileStatus locatedFileStatu

@ThriftConstructor
public HiveFileInfo(
String pathString,
String path,
boolean directory,
List<BlockLocation> blockLocations,
long length,
long fileModifiedTime,
Optional<byte[]> extraFileInfo,
Map<String, String> customSplitInfo)
{
this.path = requireNonNull(pathString, "pathString is null");
this.path = requireNonNull(path, "path is null");
this.isDirectory = directory;
this.blockLocations = requireNonNull(blockLocations, "blockLocations is null");
this.length = length;
Expand All @@ -87,9 +86,9 @@ public HiveFileInfo(
}

@ThriftField(1)
public String getPathString()
public String getPath()
{
return path.toString();
return path;
}

@ThriftField(2)
Expand Down Expand Up @@ -128,11 +127,6 @@ public Map<String, String> getCustomSplitInfo()
return customSplitInfo;
}

public Path getPath()
{
return new Path(path);
}

public long getRetainedSizeInBytes()
{
long blockLocationsSizeInBytes = blockLocations.stream().map(BlockLocation::getRetainedSizeInBytes).reduce(0L, Long::sum);
Expand All @@ -141,6 +135,11 @@ public long getRetainedSizeInBytes()
return INSTANCE_SIZE + path.length() + blockLocationsSizeInBytes + extraFileInfoSizeInBytes + customSplitInfoSizeInBytes;
}

public String getFileName()
{
return path.substring(path.lastIndexOf('/') + 1);
}

@Override
public boolean equals(Object o)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import com.google.common.primitives.Shorts;
import com.google.common.primitives.SignedBytes;
import io.airlift.slice.Slice;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
Expand Down Expand Up @@ -76,10 +75,10 @@ public final class HiveBucketing

private HiveBucketing() {}

public static int getVirtualBucketNumber(int bucketCount, Path path)
public static int getVirtualBucketNumber(int bucketCount, String path)
{
// this is equivalent to bucketing the table on a VARCHAR column containing $path
return (hashBytes(0, utf8Slice(path.toString())) & Integer.MAX_VALUE) % bucketCount;
return (hashBytes(0, utf8Slice(path)) & Integer.MAX_VALUE) % bucketCount;
}

public static int getBucket(int bucketCount, List<Type> types, Page page, int position)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,13 @@

import com.facebook.presto.hive.metastore.Storage;
import com.facebook.presto.spi.ColumnHandle;
import com.facebook.presto.spi.PrestoException;
import org.openjdk.jol.info.ClassLayout;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR;
import static io.airlift.slice.SizeOf.sizeOfObjectArray;
import static java.util.Objects.requireNonNull;

Expand All @@ -39,7 +35,7 @@ public class HiveSplitPartitionInfo
private static final int INSTANCE_SIZE = ClassLayout.parseClass(HiveSplitPartitionInfo.class).instanceSize();

private final Storage storage;
private final URI path;
private final String path;
private final List<HivePartitionKey> partitionKeys;
private final String partitionName;
private final int partitionDataColumnCount;
Expand All @@ -53,7 +49,7 @@ public class HiveSplitPartitionInfo

HiveSplitPartitionInfo(
Storage storage,
URI path,
String path,
List<HivePartitionKey> partitionKeys,
String partitionName,
int partitionDataColumnCount,
Expand Down Expand Up @@ -86,17 +82,12 @@ public class HiveSplitPartitionInfo
// and Java URI has a bug where a.resolve(a.relativize(b))
// doesn't equal 'b' if 'a' had any components after the last slash
// https://bugs.openjdk.java.net/browse/JDK-6523089
private static URI ensurePathHasTrailingSlash(URI path)
private static String ensurePathHasTrailingSlash(String path)
{
// since this is the partition path, it's always a directory.
// it's safe to add a trailing slash
if (!path.getPath().endsWith("/")) {
try {
path = new URI(path.toString() + "/");
}
catch (URISyntaxException e) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, e);
}
if (!path.endsWith("/")) {
return path + "/";
}
return path;
}
Expand Down Expand Up @@ -164,7 +155,7 @@ public int decrementAndGetReferences()
return references.decrementAndGet();
}

public URI getPath()
public String getPath()
{
return path;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ public static long parseHiveTimestamp(String value, DateTimeZone timeZone)
return HIVE_TIMESTAMP_PARSER.withZone(timeZone).parseMillis(value);
}

public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, Path path)
public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, String path)
{
if (inputFormat instanceof OrcInputFormat || inputFormat instanceof RCFileInputFormat) {
return true;
Expand All @@ -464,25 +464,25 @@ public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fil
}
try {
method.setAccessible(true);
return (boolean) method.invoke(inputFormat, fileSystem, path);
return (boolean) method.invoke(inputFormat, fileSystem, new Path(path));
}
catch (InvocationTargetException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}

public static boolean isSelectSplittable(InputFormat<?, ?> inputFormat, Path path, boolean s3SelectPushdownEnabled)
public static boolean isSelectSplittable(InputFormat<?, ?> inputFormat, String path, boolean s3SelectPushdownEnabled)
{
// S3 Select supports splitting for uncompressed CSV & JSON files
// Previous checks for supported input formats, SerDes, column types and S3 path
// are reflected by the value of s3SelectPushdownEnabled.
return !s3SelectPushdownEnabled || isUncompressed(inputFormat, path);
}

private static boolean isUncompressed(InputFormat<?, ?> inputFormat, Path path)
private static boolean isUncompressed(InputFormat<?, ?> inputFormat, String path)
{
if (inputFormat instanceof TextInputFormat) {
return !getCompressionCodec((TextInputFormat) inputFormat, path).isPresent();
return !getCompressionCodec((TextInputFormat) inputFormat, new Path(path)).isPresent();
}
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import com.facebook.presto.spi.schedule.NodeSelectionStrategy;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.Path;
import org.openjdk.jol.info.ClassLayout;

import javax.annotation.concurrent.NotThreadSafe;
Expand All @@ -32,8 +31,8 @@
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static io.airlift.slice.SizeOf.sizeOf;
import static io.airlift.slice.SizeOf.sizeOfCharArray;
import static io.airlift.slice.SizeOf.sizeOfObjectArray;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Objects.requireNonNull;

@NotThreadSafe
Expand All @@ -45,7 +44,7 @@ public class InternalHiveSplit
private static final int HOST_ADDRESS_INSTANCE_SIZE = ClassLayout.parseClass(HostAddress.class).instanceSize() +
ClassLayout.parseClass(String.class).instanceSize();

private final byte[] relativeUri;
private final String relativeUri;
private final long end;
private final long fileSize;
private final long fileModifiedTime;
Expand Down Expand Up @@ -100,7 +99,7 @@ public InternalHiveSplit(
requireNonNull(extraFileInfo, "extraFileInfo is null");
requireNonNull(encryptionInformation, "encryptionInformation is null");

this.relativeUri = relativeUri.getBytes(UTF_8);
this.relativeUri = relativeUri;
this.start = start;
this.end = end;
this.fileSize = fileSize;
Expand All @@ -113,7 +112,7 @@ public InternalHiveSplit(
this.partitionInfo = partitionInfo;
this.extraFileInfo = extraFileInfo;
this.customSplitInfo = ImmutableMap
.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null"));
.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null"));

ImmutableList.Builder<List<HostAddress>> addressesBuilder = ImmutableList.builder();
blockEndOffsets = new long[blocks.size()];
Expand All @@ -131,8 +130,7 @@ public InternalHiveSplit(

public String getPath()
{
String relativePathString = new String(relativeUri, UTF_8);
return new Path(partitionInfo.getPath().resolve(relativePathString)).toString();
return partitionInfo.getPath() + relativeUri;
}

public long getStart()
Expand Down Expand Up @@ -254,7 +252,7 @@ public void reset()
public int getEstimatedSizeInBytes()
{
int result = INSTANCE_SIZE;
result += sizeOf(relativeUri);
result += sizeOfCharArray(relativeUri.length());
result += sizeOf(blockEndOffsets);
if (!blockAddresses.isEmpty()) {
result += sizeOfObjectArray(blockAddresses.size());
Expand All @@ -275,7 +273,7 @@ public int getEstimatedSizeInBytes()
public String toString()
{
return toStringHelper(this)
.add("relativeUri", new String(relativeUri, UTF_8))
.add("relativeUri", relativeUri)
.add("start", start)
.add("end", end)
.add("fileSize", fileSize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ private InternalHiveSplitFactory createInternalHiveSplitFactory(
.map(p -> p.getColumns().size())
.orElseGet(table.getDataColumns()::size);
List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition(), partitionName);
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
String location = getPartitionLocation(table, partition.getPartition());
Path path = new Path(location);
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, inputFormatName, false);
ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(hdfsContext, path);
Expand All @@ -173,7 +174,7 @@ private InternalHiveSplitFactory createInternalHiveSplitFactory(
false,
new HiveSplitPartitionInfo(
storage,
path.toUri(),
location,
partitionKeys,
partitionName,
partitionDataColumnCount,
Expand Down Expand Up @@ -201,7 +202,7 @@ private void validateManifest(ConnectorSession session, HivePartitionMetadata pa
int fileCount = 0;
while (fileInfoIterator.hasNext()) {
HiveFileInfo fileInfo = fileInfoIterator.next();
String fileName = fileInfo.getPath().getName();
String fileName = fileInfo.getFileName();
if (!manifestFileNames.contains(fileName)) {
throw new PrestoException(
MALFORMED_HIVE_FILE_STATISTICS,
Expand Down
Loading
Loading